C# 如何用C语言中的迭代器反向读取文本文件#
我需要处理一个大文件,大约400K行和200m。但有时我必须自下而上处理。在这里如何使用迭代器(yield-return)?基本上我不喜欢把所有的东西都放在内存中。我知道在.NET中使用迭代器更有效。要创建文件迭代器,可以执行以下操作: 编辑: 这是我的固定宽度反向文件读取器的固定版本:C# 如何用C语言中的迭代器反向读取文本文件#,c#,.net,C#,.net,我需要处理一个大文件,大约400K行和200m。但有时我必须自下而上处理。在这里如何使用迭代器(yield-return)?基本上我不喜欢把所有的东西都放在内存中。我知道在.NET中使用迭代器更有效。要创建文件迭代器,可以执行以下操作: 编辑: 这是我的固定宽度反向文件读取器的固定版本: public static IEnumerable<string> readFile() { using (FileStream reader = new FileStream(@"c:\t
public static IEnumerable<string> readFile()
{
using (FileStream reader = new FileStream(@"c:\test.txt",FileMode.Open,FileAccess.Read))
{
int i=0;
StringBuilder lineBuffer = new StringBuilder();
int byteRead;
while (-i < reader.Length)
{
reader.Seek(--i, SeekOrigin.End);
byteRead = reader.ReadByte();
if (byteRead == 10 && lineBuffer.Length > 0)
{
yield return Reverse(lineBuffer.ToString());
lineBuffer.Remove(0, lineBuffer.Length);
}
lineBuffer.Append((char)byteRead);
}
yield return Reverse(lineBuffer.ToString());
reader.Close();
}
}
public static string Reverse(string str)
{
char[] arr = new char[str.Length];
for (int i = 0; i < str.Length; i++)
arr[i] = str[str.Length - 1 - i];
return new string(arr);
}
公共静态IEnumerable readFile()
{
使用(FileStream reader=newfilestream(@“c:\test.txt”、FileMode.Open、FileAccess.Read))
{
int i=0;
StringBuilder lineBuffer=新的StringBuilder();
内特比德;
而(-i0)
{
收益返回反向(lineBuffer.ToString());
lineBuffer.Remove(0,lineBuffer.Length);
}
lineBuffer.Append((char)byteRead);
}
收益返回反向(lineBuffer.ToString());
reader.Close();
}
}
公共静态字符串反转(字符串str)
{
char[]arr=新字符[str.Length];
对于(int i=0;i
除非使用固定大小的编码(如ASCII),否则向后读取文本文件非常棘手。当你得到了可变大小的编码(比如UTF-8)时,当你获取数据时,你将不得不检查你是否在一个字符的中间。
框架中没有内置任何内容,我怀疑您必须为每个可变宽度编码分别进行硬编码
编辑:这已经过一些测试——但这并不是说它周围没有一些细微的bug。它使用MiscUtil中的StreamUtil,但我在底部只包含了必要的(新)方法。哦,它需要重构——有一个非常强大的方法,你会看到:
使用系统;
使用系统集合;
使用System.Collections.Generic;
使用System.IO;
使用系统文本;
命名空间MiscUtil.IO
{
///
///采用编码(默认为UTF-8)和生成可查找流的函数
///(或为方便起见使用文件名)并从流的末尾向后生成行。
///仅支持单字节编码、UTF-8和Unicode
///函数返回的值必须是可查找的。
///
公共密封类反向读取器:IEnumerable
{
///
///默认情况下使用的缓冲区大小。具有内部访问权限的类可以指定
///不同的缓冲区大小-这对于测试很有用。
///
private const int DefaultBufferSize=4096;
///
///创建要从中读取的流的方法。
///
私有只读Func streamSource;
///
///将字节转换为文本时要使用的编码
///
私有只读编码;
///
///每次从中读取时要读取的缓冲区大小(以字节为单位)
///流。这必须至少与流的最大数量相同
///单个字符的字节数。
///
私有只读int-bufferSize;
///
///函数,当给定文件中的一个位置和一个字节时,该函数说明
///字节是否表示字符的开头。
///
私有函数characterStartDetector;
///
///从流源创建LineReader。委托仅为
///获取枚举数时调用。UTF-8用于解码
///将流转换为文本。
///
///数据源
公共反向读卡器(Func streamSource)
:此(streamSource,Encoding.UTF8)
{
}
///
///从文件名创建行读取器。仅打开该文件
///获取枚举数时(甚至检查是否存在)。
///UTF8用于将文件解码为文本。
///
///要从中读取的文件
公共ReverseLineReader(字符串文件名)
:此(文件名,Encoding.UTF8)
{
}
///
///从文件名创建行读取器。仅打开该文件
///获取枚举数时(甚至检查是否存在)。
///
///要从中读取的文件
///用于将文件解码为文本的编码
公共ReverseLineReader(字符串文件名,编码)
:此(()=>File.OpenRead(文件名),编码)
{
}
///
///从流源创建LineReader。委托仅为
///获取枚举数时调用。
///
///数据源
///用于将流解码为文本的编码
公共ReverseLineReader(Func streamSource,编码)
:此(streamSource、编码、DefaultBufferSize)
{
}
内部ReverseLineReader(Func streamSource、编码、int bufferSize)
{
this.streamSource=streamSource;
this.encoding=编码;
this.bufferSize=bufferSize;
if(编码.IsSingleByte)
{
//对于单字节编码,每个字节都是字符的开始(和结束)
characterStartDetector=(位置、数据)=>true;
}
else if(编码为Unicode编码)
{
//对于UTF-16,偶数编号的位置是字符的开头。
//TODO:这假定没有代理项对。需要更多的工作
//来处理这件事。
characterStartDetector=(位置,数据)=>(位置&1)==0;
}
else if(编码为UTF8Encoding)
{
//对于UTF
StreamReader objReader = new StreamReader(filename);
string sLine = "";
ArrayList arrText = new ArrayList();
while (sLine != null)
{
sLine = objReader.ReadLine();
if (sLine != null)
arrText.Add(sLine);
}
objReader.Close();
arrText.Reverse();
foreach (string sOutput in arrText)
{
foreach (var line in File.ReadLines(@"C:\temp\ReverseRead.txt").Reverse())
{
if (noNeedToReadFurther)
break;
// process line here
Console.WriteLine(line);
}
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace BigFileService
{
public class BigFileDumper
{
/// <summary>
/// Buffer that will store the lines until it is full.
/// Then it will dump it to temp files.
/// </summary>
public int CHUNK_SIZE = 1000;
public bool ReverseIt { get; set; }
public long TotalLineCount { get { return totalLineCount; } }
private long totalLineCount;
private int BufferCount = 0;
private StreamWriter Writer;
/// <summary>
/// List of files that would store the chunks.
/// </summary>
private List<string> LstTempFiles;
private string ParentDirectory;
private char[] trimchars = { '/', '\\'};
public BigFileDumper(string FolderPathToWrite)
{
this.LstTempFiles = new List<string>();
this.ParentDirectory = FolderPathToWrite.TrimEnd(trimchars) + "\\" + "BIG_FILE_DUMP";
this.totalLineCount = 0;
this.BufferCount = 0;
this.Initialize();
}
private void Initialize()
{
// Delete existing directory.
if (Directory.Exists(this.ParentDirectory))
{
Directory.Delete(this.ParentDirectory, true);
}
// Create a new directory.
Directory.CreateDirectory(this.ParentDirectory);
}
public void WriteLine(string line)
{
if (this.BufferCount == 0)
{
string newFile = "DumpFile_" + LstTempFiles.Count();
LstTempFiles.Add(newFile);
Writer = new StreamWriter(this.ParentDirectory + "\\" + newFile);
}
// Keep on adding in the buffer as long as size is okay.
if (this.BufferCount < this.CHUNK_SIZE)
{
this.totalLineCount++; // main count
this.BufferCount++; // Chunk count.
Writer.WriteLine(line);
}
else
{
// Buffer is full, time to create a new file.
// Close the existing file first.
Writer.Close();
// Make buffer count 0 again.
this.BufferCount = 0;
this.WriteLine(line);
}
}
public void Close()
{
if (Writer != null)
Writer.Close();
}
public string GetFullFile()
{
if (LstTempFiles.Count <= 0)
{
Debug.Assert(false, "There are no files created.");
return "";
}
string returnFilename = this.ParentDirectory + "\\" + "FullFile";
if (File.Exists(returnFilename) == false)
{
// Create a consolidated file from the existing small dump files.
// Now this is interesting. We will open the small dump files one by one.
// Depending on whether the user require inverted file, we will read them in descending order & reverted,
// or ascending order in normal way.
if (this.ReverseIt)
this.LstTempFiles.Reverse();
foreach (var fileName in LstTempFiles)
{
string fullFileName = this.ParentDirectory + "\\" + fileName;
// FileLines will use small memory depending on size of CHUNK. User has control.
var fileLines = File.ReadAllLines(fullFileName);
// Time to write in the writer.
if (this.ReverseIt)
fileLines = fileLines.Reverse().ToArray();
// Write the lines
File.AppendAllLines(returnFilename, fileLines);
}
}
return returnFilename;
}
}
}
void TestBigFileDump_File(string BIG_FILE, string FOLDER_PATH_FOR_CHUNK_FILES)
{
// Start processing the input Big file.
StreamReader reader = new StreamReader(BIG_FILE);
// Create a dump file class object to handle efficient memory management.
var bigFileDumper = new BigFileDumper(FOLDER_PATH_FOR_CHUNK_FILES);
// Set to reverse the output file.
bigFileDumper.ReverseIt = true;
bigFileDumper.CHUNK_SIZE = 100; // How much at a time to keep in RAM before dumping to local file.
while (reader.EndOfStream == false)
{
string line = reader.ReadLine();
bigFileDumper.WriteLine(line);
}
bigFileDumper.Close();
reader.Close();
// Get back full reversed file.
var reversedFilename = bigFileDumper.GetFullFile();
Console.WriteLine("Check output file - " + reversedFilename);
}
var reader = new ReverseTextReader(@"C:\Temp\ReverseTest.txt");
while (!reader.EndOfStream)
Console.WriteLine(reader.ReadLine());
/// <summary>
/// Reads a text file backwards, line-by-line.
/// </summary>
/// <remarks>This class uses file seeking to read a text file of any size in reverse order. This
/// is useful for needs such as reading a log file newest-entries first.</remarks>
public sealed class ReverseTextReader : IEnumerable<string>
{
private const int BufferSize = 16384; // The number of bytes read from the uderlying stream.
private readonly Stream _stream; // Stores the stream feeding data into this reader
private readonly Encoding _encoding; // Stores the encoding used to process the file
private byte[] _leftoverBuffer; // Stores the leftover partial line after processing a buffer
private readonly Queue<string> _lines; // Stores the lines parsed from the buffer
#region Constructors
/// <summary>
/// Creates a reader for the specified file.
/// </summary>
/// <param name="filePath"></param>
public ReverseTextReader(string filePath)
: this(new FileStream(filePath, FileMode.Open, FileAccess.Read, FileShare.Read), Encoding.Default)
{ }
/// <summary>
/// Creates a reader using the specified stream.
/// </summary>
/// <param name="stream"></param>
public ReverseTextReader(Stream stream)
: this(stream, Encoding.Default)
{ }
/// <summary>
/// Creates a reader using the specified path and encoding.
/// </summary>
/// <param name="filePath"></param>
/// <param name="encoding"></param>
public ReverseTextReader(string filePath, Encoding encoding)
: this(new FileStream(filePath, FileMode.Open, FileAccess.Read, FileShare.Read), encoding)
{ }
/// <summary>
/// Creates a reader using the specified stream and encoding.
/// </summary>
/// <param name="stream"></param>
/// <param name="encoding"></param>
public ReverseTextReader(Stream stream, Encoding encoding)
{
_stream = stream;
_encoding = encoding;
_lines = new Queue<string>(128);
// The stream needs to support seeking for this to work
if(!_stream.CanSeek)
throw new InvalidOperationException("The specified stream needs to support seeking to be read backwards.");
if (!_stream.CanRead)
throw new InvalidOperationException("The specified stream needs to support reading to be read backwards.");
// Set the current position to the end of the file
_stream.Position = _stream.Length;
_leftoverBuffer = new byte[0];
}
#endregion
#region Overrides
/// <summary>
/// Reads the next previous line from the underlying stream.
/// </summary>
/// <returns></returns>
public string ReadLine()
{
// Are there lines left to read? If so, return the next one
if (_lines.Count != 0) return _lines.Dequeue();
// Are we at the beginning of the stream? If so, we're done
if (_stream.Position == 0) return null;
#region Read and Process the Next Chunk
// Remember the current position
var currentPosition = _stream.Position;
var newPosition = currentPosition - BufferSize;
// Are we before the beginning of the stream?
if (newPosition < 0) newPosition = 0;
// Calculate the buffer size to read
var count = (int)(currentPosition - newPosition);
// Set the new position
_stream.Position = newPosition;
// Make a new buffer but append the previous leftovers
var buffer = new byte[count + _leftoverBuffer.Length];
// Read the next buffer
_stream.Read(buffer, 0, count);
// Move the position of the stream back
_stream.Position = newPosition;
// And copy in the leftovers from the last buffer
if (_leftoverBuffer.Length != 0)
Array.Copy(_leftoverBuffer, 0, buffer, count, _leftoverBuffer.Length);
// Look for CrLf delimiters
var end = buffer.Length - 1;
var start = buffer.Length - 2;
// Search backwards for a line feed
while (start >= 0)
{
// Is it a line feed?
if (buffer[start] == 10)
{
// Yes. Extract a line and queue it (but exclude the \r\n)
_lines.Enqueue(_encoding.GetString(buffer, start + 1, end - start - 2));
// And reset the end
end = start;
}
// Move to the previous character
start--;
}
// What's left over is a portion of a line. Save it for later.
_leftoverBuffer = new byte[end + 1];
Array.Copy(buffer, 0, _leftoverBuffer, 0, end + 1);
// Are we at the beginning of the stream?
if (_stream.Position == 0)
// Yes. Add the last line.
_lines.Enqueue(_encoding.GetString(_leftoverBuffer, 0, end - 1));
#endregion
// If we have something in the queue, return it
return _lines.Count == 0 ? null : _lines.Dequeue();
}
#endregion
#region IEnumerator<string> Interface
public IEnumerator<string> GetEnumerator()
{
string line;
// So long as the next line isn't null...
while ((line = ReadLine()) != null)
// Read and return it.
yield return line;
}
IEnumerator IEnumerable.GetEnumerator()
{
throw new NotImplementedException();
}
#endregion
}
//*********************************************************************************************************************************
//
// Class: BackwardReader
// Initial Date: 11/29/2010
// Last Modified: 11/29/2010
// Programmer(s): Original C# Source - the_real_herminator
// http://social.msdn.microsoft.com/forums/en-US/csharpgeneral/thread/9acdde1a-03cd-4018-9f87-6e201d8f5d09
// VB Converstion - Blake Pell
//
//*********************************************************************************************************************************
using System.Text;
using System.IO;
public class BackwardReader
{
private string path;
private FileStream fs = null;
public BackwardReader(string path)
{
this.path = path;
fs = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.ReadWrite);
fs.Seek(0, SeekOrigin.End);
}
public string Readline()
{
byte[] line;
byte[] text = new byte[1];
long position = 0;
int count;
fs.Seek(0, SeekOrigin.Current);
position = fs.Position;
//do we have trailing rn?
if (fs.Length > 1)
{
byte[] vagnretur = new byte[2];
fs.Seek(-2, SeekOrigin.Current);
fs.Read(vagnretur, 0, 2);
if (ASCIIEncoding.ASCII.GetString(vagnretur).Equals("rn"))
{
//move it back
fs.Seek(-2, SeekOrigin.Current);
position = fs.Position;
}
}
while (fs.Position > 0)
{
text.Initialize();
//read one char
fs.Read(text, 0, 1);
string asciiText = ASCIIEncoding.ASCII.GetString(text);
//moveback to the charachter before
fs.Seek(-2, SeekOrigin.Current);
if (asciiText.Equals("n"))
{
fs.Read(text, 0, 1);
asciiText = ASCIIEncoding.ASCII.GetString(text);
if (asciiText.Equals("r"))
{
fs.Seek(1, SeekOrigin.Current);
break;
}
}
}
count = int.Parse((position - fs.Position).ToString());
line = new byte[count];
fs.Read(line, 0, count);
fs.Seek(-count, SeekOrigin.Current);
return ASCIIEncoding.ASCII.GetString(line);
}
public bool SOF
{
get
{
return fs.Position == 0;
}
}
public void Close()
{
fs.Close();
}
}
[System.IO.FileStream]$fileStream = [System.IO.File]::Open("C:\Name_of_very_large_file.log", [System.IO.FileMode]::Open, [System.IO.FileAccess]::Read, [System.IO.FileShare]::ReadWrite)
[System.IO.BufferedStream]$bs = New-Object System.IO.BufferedStream $fileStream;
[System.IO.StreamReader]$sr = New-Object System.IO.StreamReader $bs;
$buff = New-Object char[] 20;
$seek = $bs.Seek($fileStream.Length - 10000, [System.IO.SeekOrigin]::Begin);
while(($line = $sr.ReadLine()) -ne $null)
{
$line;
}
using System.Management.Automation;
const string FILE_PATH = @"d:\temp\b_media_27_34_0000_25393.txt";
var ps = PowerShell.Create();
ps.AddCommand("Get-Content")
.AddParameter("Path", FILE_PATH)
.AddParameter("Tail", 1);
var psResults = ps.Invoke();
var lastLine = psResults.FirstOrDefault()?.BaseObject.ToString();
ps.Dispose();
var reader = new ReverseTextReader(path);
while (!reader.EndOfStream)
{
Console.WriteLine(reader.ReadLine());
}
public class ReverseTextReader
{
private const int LineFeedLf = 10;
private const int LineFeedCr = 13;
private readonly Stream _stream;
private readonly Encoding _encoding;
public bool EndOfStream => _stream.Position == 0;
public ReverseTextReader(Stream stream, Encoding encoding)
{
_stream = stream;
_encoding = encoding;
_stream.Position = _stream.Length;
}
public string ReadLine()
{
if (_stream.Position == 0) return null;
var line = new List<byte>();
var endOfLine = false;
while (!endOfLine)
{
var b = _stream.ReadByteFromBehind();
if (b == -1 || b == LineFeedLf)
{
endOfLine = true;
}
line.Add(Convert.ToByte(b));
}
line.Reverse();
return _encoding.GetString(line.ToArray());
}
}
public static class StreamExtensions
{
public static int ReadByteFromBehind(this Stream stream)
{
if (stream.Position == 0) return -1;
stream.Position = stream.Position - 1;
var value = stream.ReadByte();
stream.Position = stream.Position - 1;
return value;
}
}