Java 在InputStream中筛选(搜索和替换)字节数组
我有一个InputStream,它将html文件作为输入参数。我必须从输入流中获取字节 我有一个字符串:Java 在InputStream中筛选(搜索和替换)字节数组,java,input,bytearray,Java,Input,Bytearray,我有一个InputStream,它将html文件作为输入参数。我必须从输入流中获取字节 我有一个字符串:“XYZ”。我想将这个字符串转换成字节格式,并检查我从InputStream获得的字节序列中的字符串是否匹配。如果有,我必须用其他字符串的bye序列替换匹配 有人能帮我吗?我使用正则表达式查找和替换。然而,我不知道如何查找和替换字节流 以前,我使用jsoup解析html并替换字符串,但是由于一些utf编码问题,当我这样做时,文件似乎已损坏 TL;博士:我的问题是: 是一种在Java原始输入流中
“XYZ”
。我想将这个字符串转换成字节格式,并检查我从InputStream获得的字节序列中是否存在与该字符串匹配的部分。如果有,我必须用另一个字符串的字节序列替换匹配的部分
有人能帮我吗?我使用正则表达式查找和替换。然而,我不知道如何查找和替换字节流
以前,我使用jsoup解析html并替换字符串,但是由于一些utf编码问题,当我这样做时,文件似乎已损坏
TL;DR(长话短说):我的问题是:
有没有办法在Java的原始输入流中查找并替换以字节形式表示的字符串?不确定您是否选择了解决问题的最佳方法。话虽如此,我不喜欢用“不要这样做”来回答问题(而且我的原则是不这样做),所以下面给出答案。请看一看FilterInputStream。文档中写道:FilterInputStream包含一些其他输入流,它将其用作基本数据源,可能沿途转换数据或提供附加功能
写下来是一个有趣的练习。下面是一个完整的示例:
import java.io.*;
import java.util.*;
/**
 * An input-stream filter that replaces every occurrence of one byte sequence
 * with another while the data is being read.
 *
 * <p>The filter keeps a look-ahead window as long as the search pattern. When
 * the window equals the pattern, the window is dropped and the replacement is
 * emitted; otherwise the oldest byte of the window is emitted unchanged.
 *
 * <p>All queued values use the {@link InputStream#read()} convention: an
 * unsigned byte in the range 0..255, or -1 for end of stream.
 */
class ReplacingInputStream extends FilterInputStream {

    /** Bytes read from the wrapped stream that have not been classified yet. */
    final Deque<Integer> inQueue = new ArrayDeque<>();
    /** Bytes that are ready to be handed to the caller. */
    final Deque<Integer> outQueue = new ArrayDeque<>();
    final byte[] search, replacement;

    /**
     * @param in          the stream to filter
     * @param search      the byte sequence to look for; must not be empty
     * @param replacement the bytes emitted instead of each match; may be empty
     * @throws NullPointerException     if {@code search} or {@code replacement} is null
     * @throws IllegalArgumentException if {@code search} is empty
     */
    protected ReplacingInputStream(InputStream in,
                                   byte[] search,
                                   byte[] replacement) {
        super(in);
        if (search == null || replacement == null) {
            throw new NullPointerException("search and replacement must not be null");
        }
        if (search.length == 0) {
            // An empty pattern would "match" forever without consuming input.
            throw new IllegalArgumentException("search must not be empty");
        }
        this.search = search;
        this.replacement = replacement;
    }

    /** Returns true when the look-ahead window starts with the complete search pattern. */
    private boolean isMatchFound() {
        Iterator<Integer> inIter = inQueue.iterator();
        for (int i = 0; i < search.length; i++) {
            // Mask the pattern byte: queued values are unsigned (0..255) while
            // Java bytes are signed, so bytes >= 0x80 would otherwise never match.
            if (!inIter.hasNext() || (search[i] & 0xFF) != inIter.next().intValue()) {
                return false;
            }
        }
        return true;
    }

    /** Fills the look-ahead window up to the pattern length, stopping at end of stream. */
    private void readAhead() throws IOException {
        while (inQueue.size() < search.length) {
            int next = super.read();
            inQueue.offer(next);
            if (next == -1) {
                break;
            }
        }
    }

    @Override
    public int read() throws IOException {
        // Loop rather than branch once: a match with an empty replacement
        // produces no output, so another round of look-ahead is required.
        while (outQueue.isEmpty()) {
            readAhead();
            if (isMatchFound()) {
                for (int i = 0; i < search.length; i++) {
                    inQueue.remove();
                }
                for (byte b : replacement) {
                    // Keep the 0..255 contract; a raw 0xFF byte would look like EOF (-1).
                    outQueue.offer(b & 0xFF);
                }
            } else {
                outQueue.add(inQueue.remove());
            }
        }
        return outQueue.remove();
    }

    /**
     * Bulk read routed through {@link #read()} so that replacements are applied.
     * The inherited implementation forwards to the wrapped stream directly and
     * would bypass the filter.
     */
    @Override
    public int read(byte[] b, int off, int len) throws IOException {
        if (b == null) {
            throw new NullPointerException();
        }
        if (off < 0 || len < 0 || len > b.length - off) {
            throw new IndexOutOfBoundsException();
        }
        if (len == 0) {
            return 0;
        }
        int count = 0;
        while (count < len) {
            int c = read();
            if (c == -1) {
                break;
            }
            b[off + count] = (byte) c;
            count++;
        }
        // Per the InputStream contract, -1 is returned only when nothing was read.
        return count == 0 ? -1 : count;
    }
}
给定字符串“Hello xyz world”
的字节,它将打印:
Hello abc world
在字节流(
InputStream
)上没有任何用于搜索和替换的内置功能
而且,高效、正确地完成这项任务的方法目前还不明显。我已经为streams实现了Boyer-Moore算法,它工作得很好,但需要一些时间。如果没有这样的算法,你必须求助于蛮力方法,在那里你可能会很慢
即使您将HTML解码为文本,使用正则表达式进行匹配也可能出现意外的失败,因为HTML不是“正则”语言
因此,尽管您遇到了一些困难,但我建议您继续使用原始方法将HTML解析为文档。当您在字符编码方面遇到问题时,从长远来看,修复正确的解决方案可能比修复错误的解决方案更容易。以下方法可以奏效,但我不知道对性能的影响有多大
InputStreamReader
包装InputStream
FilterReader
包装InputStreamReader
,然后ReaderInputStream
包装FilterReader
如果您想使用正则表达式替换字符串,那么您可以使用我的一个工具,它是
FilterReader
的一个方便的替代品。您可以在Streamflyer的网页上找到字节流的示例。希望这能有所帮助。我也需要类似的东西,并决定推出自己的解决方案,而不是使用@aioobe上面的示例。看一看这张照片。您可以从maven central中提取库,或者只是复制源代码
这就是你使用它的方式。在本例中,我使用了一个嵌套实例来替换两种模式,即两种fix dos和mac line Ending
new ReplacingInputStream(new ReplacingInputStream(is, "\n\r", "\n"), "\r", "\n");
以下是完整的源代码:
/**
 * Simple FilterInputStream that replaces occurrences of a byte pattern with
 * other bytes while the stream is being read.
 *
 * <p>Matching is done byte by byte. When a partial match fails, the first
 * matched byte is emitted and the remaining bytes are scanned again, so
 * matches that start inside a failed partial match (for example pattern
 * {@code "ab"} in input {@code "aab"}) are still found.
 */
public class ReplacingInputStream extends FilterInputStream {
    // Bytes taken from the input during a failed partial match that must be
    // scanned again. Used as a stack: the next byte to scan is buf[unbufferIndex - 1].
    int[] buf=null;
    // number of leading pattern bytes matched so far
    int matchedIndex=0;
    // number of bytes currently waiting in buf
    int unbufferIndex=0;
    // index of the next replacement byte to emit
    int replacedIndex=0;
    private final byte[] pattern;
    private final byte[] replacement;
    private State state=State.NOT_MATCHED;

    // simple state machine for keeping track of what we are doing
    private enum State {
        // nothing matched, nothing waiting to be re-scanned
        NOT_MATCHED,
        // a proper prefix of the pattern has been matched
        MATCHING,
        // a full match was found; replacement bytes are being emitted
        REPLACING,
        // nothing matched, but bytes from a failed partial match await re-scanning
        UNBUFFER
    }

    /**
     * @param is input
     * @return nested replacing stream that replaces \r\n (DOS) and \r (old Mac) line endings with UNIX ones "\n".
     */
    public static InputStream newLineNormalizingInputStream(InputStream is) {
        // DOS line endings are CR LF ("\r\n"). Collapse those first so that the
        // outer stream does not turn the CR of a CR LF pair into a second newline.
        return new ReplacingInputStream(new ReplacingInputStream(is, "\r\n", "\n"), "\r", "\n");
    }

    /**
     * Replace occurrences of pattern in the input. Note: input is assumed to be UTF-8 encoded. If not the case use byte[] based pattern and replacement.
     * @param in input
     * @param pattern pattern to replace.
     * @param replacement the replacement or null
     */
    public ReplacingInputStream(InputStream in, String pattern, String replacement) {
        this(in,pattern.getBytes(StandardCharsets.UTF_8), replacement==null ? null : replacement.getBytes(StandardCharsets.UTF_8));
    }

    /**
     * Replace occurrences of pattern in the input.
     * @param in input
     * @param pattern pattern to replace; must not be null or empty
     * @param replacement the replacement or null (null and empty both delete the pattern)
     * @throws NullPointerException if pattern is null
     * @throws IllegalArgumentException if pattern is empty
     */
    public ReplacingInputStream(InputStream in, byte[] pattern, byte[] replacement) {
        super(in);
        if (pattern == null) {
            throw new NullPointerException("pattern must not be null");
        }
        if (pattern.length == 0) {
            throw new IllegalArgumentException("pattern length should be > 0");
        }
        this.pattern = pattern;
        this.replacement = replacement;
        // at most pattern.length - 1 bytes are ever waiting to be re-scanned
        buf = new int[pattern.length];
    }

    @Override
    public int read(byte[] b, int off, int len) throws IOException {
        // copy of the InputStream logic; we need to call our own read() because
        // FilterInputStream delegates bulk reads straight to the wrapped stream
        if (b == null) {
            throw new NullPointerException();
        } else if (off < 0 || len < 0 || len > b.length - off) {
            throw new IndexOutOfBoundsException();
        } else if (len == 0) {
            return 0;
        }
        int c = read();
        if (c == -1) {
            return -1;
        }
        b[off] = (byte)c;
        int i = 1;
        try {
            for (; i < len ; i++) {
                c = read();
                if (c == -1) {
                    break;
                }
                b[off + i] = (byte)c;
            }
        } catch (IOException ignored) {
            // Same best-effort behavior as InputStream.read(byte[], int, int):
            // once at least one byte was read, a later failure ends this call
            // early and the bytes read so far are returned.
        }
        return i;
    }

    @Override
    public int read(byte[] b) throws IOException {
        // call our own read
        return read(b, 0, b.length);
    }

    @Override
    public int read() throws IOException {
        // Iterative on purpose: recursing once per matched byte (or per deleted
        // match) would make the stack depth depend on the input.
        while (true) {
            if (state == State.REPLACING) {
                // mask so that bytes >= 0x80 are returned as 128..255, never negative
                int out = replacement[replacedIndex++] & 0xFF;
                if (replacedIndex == replacement.length) {
                    replacedIndex = 0;
                    state = idleState();
                }
                return out;
            }

            // bytes waiting for a re-scan come before fresh input
            int next = unbufferIndex > 0 ? buf[--unbufferIndex] : super.read();

            // mask the pattern byte: 'next' is unsigned (or -1 for EOF, which never matches)
            if (next == (pattern[matchedIndex] & 0xFF)) {
                matchedIndex++;
                if (matchedIndex < pattern.length) {
                    state = State.MATCHING;
                } else {
                    // we've found a full match!
                    matchedIndex = 0;
                    if (replacement == null || replacement.length == 0) {
                        // the replacement is empty: drop the match and keep scanning
                        state = idleState();
                    } else {
                        replacedIndex = 0;
                        state = State.REPLACING;
                    }
                }
                continue;
            }

            if (matchedIndex == 0) {
                // ordinary byte (or EOF) outside of any match
                state = idleState();
                return next;
            }

            // Mismatch after a partial match. The matched bytes equal
            // pattern[0..matchedIndex-1]. Emit the first one and push the rest,
            // followed by the mismatching byte, back for another scan. They are
            // pushed in reverse so that they pop in their original order.
            buf[unbufferIndex++] = next;
            for (int i = matchedIndex - 1; i >= 1; i--) {
                buf[unbufferIndex++] = pattern[i] & 0xFF;
            }
            matchedIndex = 0;
            state = idleState();
            return pattern[0] & 0xFF;
        }
    }

    // State to use when no match is in progress.
    private State idleState() {
        return unbufferIndex > 0 ? State.UNBUFFER : State.NOT_MATCHED;
    }

    @Override
    public String toString() {
        return state.name() + " " + matchedIndex + " " + replacedIndex + " " + unbufferIndex;
    }
}
/**
*简单的FilterInputStream,可以用其他内容替换字节的出现。
*/
公共类ReplacingInputStream扩展FilterInputStream{
//匹配时,这是字节的位置。
int[]buf=null;
int matchedIndex=0;
int unbufferIndex=0;
int replacedIndex=0;
私有最终字节[]模式;
私有最终字节[]替换;
私有状态=状态。不匹配;
//用于跟踪我们正在做的事情的简单状态机
私有枚举状态{
不匹配,
匹配,
取代,
解除缓冲
}
/**
*@param已输入
*@return嵌套替换流,该流将\n\r(DOS)和\r(MAC)行结尾替换为UNIX行结尾“\n”。
*/
公共静态InputStream newLineNormalizingInputStream(InputStream为){
返回新的ReplacingInputStream(新的ReplacingInputStream(是“\n\r”、“\n”)、“\r”、“\n”);
}
/**
*替换输入中出现的模式。注意:输入假定为UTF-8编码。如果不是这种情况,则使用基于字节[]的模式和替换。
*输入中的@param
*@param要替换的模式。
*@param replacement将替换为空
*/
公共ReplacingInputStream(输入流输入、字符串模式、字符串替换){
这(在模式中,pattern.getBytes(StandardCharsets.UTF_8),replacement==null?null:replacement.getBytes(StandardCharsets.UTF_8));
}
/**
*替换输入中出现的模式。
*输入中的@param
*@param要替换的模式
*@param replacement将替换为空
*/
公共ReplacingInputStream(输入流输入,字节[]模式,字节[]替换){
超级(in),;
Validate.notNull(模式);
验证.isTrue(pattern.length>0,“pattern长度应该大于0”,pattern.length);
这个模式=模式;
这个。替换=替换;
//我们永远不会匹配超过图案长度的图案
buf=新整数[模式长度];
}
@凌驾
公共整数读取(字节[]b,整数关闭,整数长度)引发IOException{
//父逻辑的副本;我们需要调用我们自己的read()而不是super.read(),super.read()将委托而不是调用我们的read
如果(b==null){
抛出新的NullPointerException();
}else if(off<0 | | len<0 | | len>b.长度-off){
抛出新的索引
/**
 * Simple FilterInputStream that replaces occurrences of a byte pattern with
 * other bytes while the stream is being read.
 *
 * <p>Matching is done byte by byte. When a partial match fails, the first
 * matched byte is emitted and the remaining bytes are scanned again, so
 * matches that start inside a failed partial match (for example pattern
 * {@code "ab"} in input {@code "aab"}) are still found.
 */
public class ReplacingInputStream extends FilterInputStream {
    // Bytes taken from the input during a failed partial match that must be
    // scanned again. Used as a stack: the next byte to scan is buf[unbufferIndex - 1].
    int[] buf=null;
    // number of leading pattern bytes matched so far
    int matchedIndex=0;
    // number of bytes currently waiting in buf
    int unbufferIndex=0;
    // index of the next replacement byte to emit
    int replacedIndex=0;
    private final byte[] pattern;
    private final byte[] replacement;
    private State state=State.NOT_MATCHED;

    // simple state machine for keeping track of what we are doing
    private enum State {
        // nothing matched, nothing waiting to be re-scanned
        NOT_MATCHED,
        // a proper prefix of the pattern has been matched
        MATCHING,
        // a full match was found; replacement bytes are being emitted
        REPLACING,
        // nothing matched, but bytes from a failed partial match await re-scanning
        UNBUFFER
    }

    /**
     * @param is input
     * @return nested replacing stream that replaces \r\n (DOS) and \r (old Mac) line endings with UNIX ones "\n".
     */
    public static InputStream newLineNormalizingInputStream(InputStream is) {
        // DOS line endings are CR LF ("\r\n"). Collapse those first so that the
        // outer stream does not turn the CR of a CR LF pair into a second newline.
        return new ReplacingInputStream(new ReplacingInputStream(is, "\r\n", "\n"), "\r", "\n");
    }

    /**
     * Replace occurrences of pattern in the input. Note: input is assumed to be UTF-8 encoded. If not the case use byte[] based pattern and replacement.
     * @param in input
     * @param pattern pattern to replace.
     * @param replacement the replacement or null
     */
    public ReplacingInputStream(InputStream in, String pattern, String replacement) {
        this(in,pattern.getBytes(StandardCharsets.UTF_8), replacement==null ? null : replacement.getBytes(StandardCharsets.UTF_8));
    }

    /**
     * Replace occurrences of pattern in the input.
     * @param in input
     * @param pattern pattern to replace; must not be null or empty
     * @param replacement the replacement or null (null and empty both delete the pattern)
     * @throws NullPointerException if pattern is null
     * @throws IllegalArgumentException if pattern is empty
     */
    public ReplacingInputStream(InputStream in, byte[] pattern, byte[] replacement) {
        super(in);
        if (pattern == null) {
            throw new NullPointerException("pattern must not be null");
        }
        if (pattern.length == 0) {
            throw new IllegalArgumentException("pattern length should be > 0");
        }
        this.pattern = pattern;
        this.replacement = replacement;
        // at most pattern.length - 1 bytes are ever waiting to be re-scanned
        buf = new int[pattern.length];
    }

    @Override
    public int read(byte[] b, int off, int len) throws IOException {
        // copy of the InputStream logic; we need to call our own read() because
        // FilterInputStream delegates bulk reads straight to the wrapped stream
        if (b == null) {
            throw new NullPointerException();
        } else if (off < 0 || len < 0 || len > b.length - off) {
            throw new IndexOutOfBoundsException();
        } else if (len == 0) {
            return 0;
        }
        int c = read();
        if (c == -1) {
            return -1;
        }
        b[off] = (byte)c;
        int i = 1;
        try {
            for (; i < len ; i++) {
                c = read();
                if (c == -1) {
                    break;
                }
                b[off + i] = (byte)c;
            }
        } catch (IOException ignored) {
            // Same best-effort behavior as InputStream.read(byte[], int, int):
            // once at least one byte was read, a later failure ends this call
            // early and the bytes read so far are returned.
        }
        return i;
    }

    @Override
    public int read(byte[] b) throws IOException {
        // call our own read
        return read(b, 0, b.length);
    }

    @Override
    public int read() throws IOException {
        // Iterative on purpose: recursing once per matched byte (or per deleted
        // match) would make the stack depth depend on the input.
        while (true) {
            if (state == State.REPLACING) {
                // mask so that bytes >= 0x80 are returned as 128..255, never negative
                int out = replacement[replacedIndex++] & 0xFF;
                if (replacedIndex == replacement.length) {
                    replacedIndex = 0;
                    state = idleState();
                }
                return out;
            }

            // bytes waiting for a re-scan come before fresh input
            int next = unbufferIndex > 0 ? buf[--unbufferIndex] : super.read();

            // mask the pattern byte: 'next' is unsigned (or -1 for EOF, which never matches)
            if (next == (pattern[matchedIndex] & 0xFF)) {
                matchedIndex++;
                if (matchedIndex < pattern.length) {
                    state = State.MATCHING;
                } else {
                    // we've found a full match!
                    matchedIndex = 0;
                    if (replacement == null || replacement.length == 0) {
                        // the replacement is empty: drop the match and keep scanning
                        state = idleState();
                    } else {
                        replacedIndex = 0;
                        state = State.REPLACING;
                    }
                }
                continue;
            }

            if (matchedIndex == 0) {
                // ordinary byte (or EOF) outside of any match
                state = idleState();
                return next;
            }

            // Mismatch after a partial match. The matched bytes equal
            // pattern[0..matchedIndex-1]. Emit the first one and push the rest,
            // followed by the mismatching byte, back for another scan. They are
            // pushed in reverse so that they pop in their original order.
            buf[unbufferIndex++] = next;
            for (int i = matchedIndex - 1; i >= 1; i--) {
                buf[unbufferIndex++] = pattern[i] & 0xFF;
            }
            matchedIndex = 0;
            state = idleState();
            return pattern[0] & 0xFF;
        }
    }

    // State to use when no match is in progress.
    private State idleState() {
        return unbufferIndex > 0 ? State.UNBUFFER : State.NOT_MATCHED;
    }

    @Override
    public String toString() {
        return state.name() + " " + matchedIndex + " " + replacedIndex + " " + unbufferIndex;
    }
}
import java.io.IOException;
import java.io.InputStream;
/**
 * An {@link InputStream} that reads from another stream and replaces every
 * occurrence of one byte sequence ("token") with another.
 *
 * <p>When a partial match fails, the first matched byte is emitted and the
 * remaining bytes are scanned again, so a token that starts inside a failed
 * partial match (for example token {@code "ab"} in input {@code "aab"}) is
 * still replaced.
 */
public class TokenReplacingStream extends InputStream {

    private final InputStream source;
    private final byte[] oldBytes;
    private final byte[] newBytes;

    /** Number of leading token bytes matched so far. */
    private int tokenMatchIndex = 0;
    /** Index of the next replacement byte to emit while {@link #replacing} is set. */
    private int bytesIndex = 0;
    /** True while the replacement for a completed match is being emitted. */
    private boolean replacing;
    /**
     * Bytes consumed during a failed partial match that must be scanned again.
     * Used as a stack: the next byte to scan is {@code pending[pendingCount - 1]}.
     */
    private final int[] pending;
    private int pendingCount = 0;
    private int numberOfTokensReplaced = 0;

    /**
     * @param source   the stream to read from
     * @param oldBytes the token to search for; must not be empty
     * @param newBytes the bytes emitted in place of each token; may be empty
     * @throws NullPointerException     if any argument is null
     * @throws IllegalArgumentException if {@code oldBytes} is empty
     */
    public TokenReplacingStream(InputStream source, byte[] oldBytes, byte[] newBytes) {
        if (source == null || oldBytes == null || newBytes == null) {
            throw new NullPointerException("source, oldBytes and newBytes must not be null");
        }
        // A real check instead of an assert: assertions are normally disabled,
        // and an empty token would otherwise "match" forever.
        if (oldBytes.length == 0) {
            throw new IllegalArgumentException("oldBytes must not be empty");
        }
        this.source = source;
        this.oldBytes = oldBytes;
        this.newBytes = newBytes;
        // at most oldBytes.length - 1 bytes are ever waiting to be re-scanned
        this.pending = new int[oldBytes.length];
    }

    @Override
    public int read() throws IOException {
        // Iterative on purpose: recursing once per matched byte would make the
        // stack depth depend on the input.
        while (true) {
            if (replacing) {
                if (bytesIndex < newBytes.length) {
                    // mask so bytes >= 0x80 come out as 128..255, never negative
                    return newBytes[bytesIndex++] & 0xFF;
                }
                replacing = false;
                bytesIndex = 0;
            }

            // bytes waiting for a re-scan come before fresh input
            int b = pendingCount > 0 ? pending[--pendingCount] : source.read();

            // mask the token byte: 'b' is unsigned (or -1 for EOF, which never matches)
            if (b == (oldBytes[tokenMatchIndex] & 0xFF)) {
                tokenMatchIndex++;
                if (tokenMatchIndex == oldBytes.length) {
                    tokenMatchIndex = 0;
                    numberOfTokensReplaced++;
                    replacing = true;
                }
                continue;
            }

            if (tokenMatchIndex == 0) {
                // ordinary byte (or EOF) outside of any match
                return b;
            }

            // Mismatch after a partial match. The matched bytes equal
            // oldBytes[0..tokenMatchIndex-1]. Emit the first one and push the
            // rest, followed by the mismatching byte, back for another scan.
            // They are pushed in reverse so that they pop in original order.
            pending[pendingCount++] = b;
            for (int i = tokenMatchIndex - 1; i >= 1; i--) {
                pending[pendingCount++] = oldBytes[i] & 0xFF;
            }
            tokenMatchIndex = 0;
            return oldBytes[0] & 0xFF;
        }
    }

    /** Closes the underlying source stream. */
    @Override
    public void close() throws IOException {
        source.close();
    }

    /** Returns how many complete tokens have been matched and replaced so far. */
    public int getNumberOfTokensReplaced() {
        return numberOfTokensReplaced;
    }
}
/**
 * Copies {@code in} to {@code out}, replacing every occurrence of
 * {@code search} with {@code replace}.
 *
 * <p>The bytes are decoded and encoded with the platform default charset.
 * NOTE(review): consider passing an explicit charset if the data is known to
 * be UTF-8.
 *
 * @param in      the stream to read from
 * @param out     the stream to write to; it is flushed but not closed
 * @param search  the text to look for
 * @param replace the text written in place of each match
 * @throws IOException if reading or writing fails
 */
public static void replaceStream(InputStream in, OutputStream out, String search, String replace) throws IOException
{
    Writer writer = new OutputStreamWriter(out);
    replaceStream(new InputStreamReader(in), writer, search, replace);
    // OutputStreamWriter buffers encoded bytes internally; without a flush the
    // tail of the output (or all of it, for short inputs) never reaches 'out'.
    writer.flush();
}
/**
 * Copies characters from {@code in} to {@code out}, replacing every
 * occurrence of {@code search} with {@code replace}.
 *
 * <p>When a partial match fails, the first matched character is written and
 * the remaining characters are scanned again, so a match that starts inside a
 * failed partial match (for example {@code "ab"} in {@code "aab"}) is still
 * replaced. Neither {@code in} nor {@code out} is closed or flushed.
 *
 * @param in      the reader to consume until end of stream
 * @param out     the writer that receives the result
 * @param search  the text to look for; must not be empty
 * @param replace the text written in place of each match
 * @throws IllegalArgumentException if {@code search} is empty
 * @throws IOException              if reading or writing fails
 */
public static void replaceStream(Reader in, Writer out, String search, String replace) throws IOException
{
    char[] searchChars = search.toCharArray();
    if (searchChars.length == 0) {
        throw new IllegalArgumentException("search must not be empty");
    }
    // Characters consumed during a failed partial match that must be scanned
    // again. Used as a stack: the next one to scan is pending[pendingCount - 1].
    int[] pending = new int[searchChars.length];
    int pendingCount = 0;
    // Number of leading search characters matched so far
    int matched = 0;
    while (true) {
        int r = pendingCount > 0 ? pending[--pendingCount] : in.read();
        if (r == searchChars[matched]) {
            // The char matches our pattern (end of stream, -1, never does)
            matched++;
            if (matched == searchChars.length) {
                // We have reached a matching string
                out.write(replace);
                matched = 0;
            }
        } else if (matched > 0) {
            // Partial match failed: the matched chars equal searchChars[0..matched-1].
            // Write the first one; push the rest and the current char back, in
            // reverse so that they pop in their original order.
            out.write(searchChars[0]);
            pending[pendingCount++] = r;
            for (int i = matched - 1; i >= 1; i--) {
                pending[pendingCount++] = searchChars[i];
            }
            matched = 0;
        } else if (r < 0) {
            // End of stream. Compare against < 0, not <= 0: NUL (0) is a valid char.
            break;
        } else {
            // No match and nothing buffered, just pass the char forward
            out.write(r);
        }
    }
}