[c#] C #에서 반복기를 사용하여 텍스트 파일을 반대로 읽는 방법

Question 1

약 400K 라인과 200M 정도의 대용량 파일을 처리해야합니다.하지만 때로는 상향식으로 처리해야합니다. 여기서 반복자 (수익률)를 어떻게 사용할 수 있습니까? 기본적으로 모든 것을 메모리에로드하는 것을 좋아하지 않습니다. .NET에서 반복자를 사용하는 것이 더 효율적이라는 것을 알고 있습니다.

Question 2

고정 크기 인코딩 (예 : ASCII)을 사용하지 않는 한 텍스트 파일을 역방향으로 읽는 것은 정말 까다 롭습니다. 가변 크기 인코딩 (예 : UTF-8)을 사용하는 경우 데이터를 가져올 때 문자 중간에 있는지 여부를 계속 확인해야합니다.

프레임 워크에는 아무것도 내장되어 있지 않으며 각 가변 너비 인코딩에 대해 별도의 하드 코딩을해야한다고 생각합니다.

편집 : 이것은 다소 테스트되었지만 여전히 약간의 미묘한 버그가 없다고 말하는 것은 아닙니다. MiscUtil의 StreamUtil을 사용하지만 하단에 필요한 (새로운) 방법 만 포함했습니다. 아, 그리고 리팩토링이 필요합니다. 보시다시피 꽤 무거운 방법이 하나 있습니다.

using System;
using System.Collections;
using System.Collections.Generic;
using System.IO;
using System.Text;

namespace MiscUtil.IO
{
    /// <summary>
    /// Takes an encoding (defaulting to UTF-8) and a function which produces a seekable stream
    /// (or a filename for convenience) and yields lines from the end of the stream backwards.
    /// Only single byte encodings, and UTF-8 and Unicode, are supported. The stream
    /// returned by the function must be seekable.
    /// </summary>
    public sealed class ReverseLineReader : IEnumerable<string>
    {
        /// <summary>
        /// Buffer size to use by default. Classes with internal access can specify
        /// a different buffer size - this is useful for testing.
        /// </summary>
        private const int DefaultBufferSize = 4096;

        /// <summary>
        /// Means of creating a Stream to read from.
        /// </summary>
        private readonly Func<Stream> streamSource;

        /// <summary>
        /// Encoding to use when converting bytes to text
        /// </summary>
        private readonly Encoding encoding;

        /// <summary>
        /// Size of buffer (in bytes) to read each time we read from the
        /// stream. This must be at least as big as the maximum number of
        /// bytes for a single character.
        /// </summary>
        private readonly int bufferSize;

        /// <summary>
        /// Function which, when given a position within a file and a byte, states whether
        /// or not the byte represents the start of a character.
        /// </summary>
        private Func<long,byte,bool> characterStartDetector;

        /// <summary>
        /// Creates a LineReader from a stream source. The delegate is only
        /// called when the enumerator is fetched. UTF-8 is used to decode
        /// the stream into text.
        /// </summary>
        /// <param name="streamSource">Data source</param>
        public ReverseLineReader(Func<Stream> streamSource)
            : this(streamSource, Encoding.UTF8)
        {
        }

        /// <summary>
        /// Creates a LineReader from a filename. The file is only opened
        /// (or even checked for existence) when the enumerator is fetched.
        /// UTF8 is used to decode the file into text.
        /// </summary>
        /// <param name="filename">File to read from</param>
        public ReverseLineReader(string filename)
            : this(filename, Encoding.UTF8)
        {
        }

        /// <summary>
        /// Creates a LineReader from a filename. The file is only opened
        /// (or even checked for existence) when the enumerator is fetched.
        /// </summary>
        /// <param name="filename">File to read from</param>
        /// <param name="encoding">Encoding to use to decode the file into text</param>
        public ReverseLineReader(string filename, Encoding encoding)
            : this(() => File.OpenRead(filename), encoding)
        {
        }

        /// <summary>
        /// Creates a LineReader from a stream source. The delegate is only
        /// called when the enumerator is fetched.
        /// </summary>
        /// <param name="streamSource">Data source</param>
        /// <param name="encoding">Encoding to use to decode the stream into text</param>
        public ReverseLineReader(Func<Stream> streamSource, Encoding encoding)
            : this(streamSource, encoding, DefaultBufferSize)
        {
        }

        internal ReverseLineReader(Func<Stream> streamSource, Encoding encoding, int bufferSize)
        {
            this.streamSource = streamSource;
            this.encoding = encoding;
            this.bufferSize = bufferSize;
            if (encoding.IsSingleByte)
            {
                // For a single byte encoding, every byte is the start (and end) of a character
                characterStartDetector = (pos, data) => true;
            }
            else if (encoding is UnicodeEncoding)
            {
                // For UTF-16, even-numbered positions are the start of a character.
                // TODO: This assumes no surrogate pairs. More work required
                // to handle that.
                characterStartDetector = (pos, data) => (pos & 1) == 0;
            }
            else if (encoding is UTF8Encoding)
            {
                // For UTF-8, bytes with the top bit clear or the second bit set are the start of a character
                // See http://www.cl.cam.ac.uk/~mgk25/unicode.html
                characterStartDetector = (pos, data) => (data & 0x80) == 0 || (data & 0x40) != 0;
            }
            else
            {
                throw new ArgumentException("Only single byte, UTF-8 and Unicode encodings are permitted");
            }
        }

        /// <summary>
        /// Returns the enumerator reading strings backwards. If this method discovers that
        /// the returned stream is either unreadable or unseekable, a NotSupportedException is thrown.
        /// </summary>
        public IEnumerator<string> GetEnumerator()
        {
            Stream stream = streamSource();
            if (!stream.CanSeek)
            {
                stream.Dispose();
                throw new NotSupportedException("Unable to seek within stream");
            }
            if (!stream.CanRead)
            {
                stream.Dispose();
                throw new NotSupportedException("Unable to read within stream");
            }
            return GetEnumeratorImpl(stream);
        }

        private IEnumerator<string> GetEnumeratorImpl(Stream stream)
        {
            try
            {
                long position = stream.Length;

                if (encoding is UnicodeEncoding && (position & 1) != 0)
                {
                    throw new InvalidDataException("UTF-16 encoding provided, but stream has odd length.");
                }

                // Allow up to two bytes for data from the start of the previous
                // read which didn't quite make it as full characters
                byte[] buffer = new byte[bufferSize + 2];
                char[] charBuffer = new char[encoding.GetMaxCharCount(buffer.Length)];
                int leftOverData = 0;
                String previousEnd = null;
                // TextReader doesn't return an empty string if there's line break at the end
                // of the data. Therefore we don't return an empty string if it's our *first*
                // return.
                bool firstYield = true;

                // A line-feed at the start of the previous buffer means we need to swallow
                // the carriage-return at the end of this buffer - hence this needs declaring
                // way up here!
                bool swallowCarriageReturn = false;

                while (position > 0)
                {
                    int bytesToRead = Math.Min(position > int.MaxValue ? bufferSize : (int)position, bufferSize);

                    position -= bytesToRead;
                    stream.Position = position;
                    StreamUtil.ReadExactly(stream, buffer, bytesToRead);
                    // If we haven't read a full buffer, but we had bytes left
                    // over from before, copy them to the end of the buffer
                    if (leftOverData > 0 && bytesToRead != bufferSize)
                    {
                        // Buffer.BlockCopy doesn't document its behaviour with respect
                        // to overlapping data: we *might* just have read 7 bytes instead of
                        // 8, and have two bytes to copy...
                        Array.Copy(buffer, bufferSize, buffer, bytesToRead, leftOverData);
                    }
                    // We've now *effectively* read this much data.
                    bytesToRead += leftOverData;

                    int firstCharPosition = 0;
                    while (!characterStartDetector(position + firstCharPosition, buffer[firstCharPosition]))
                    {
                        firstCharPosition++;
                        // Bad UTF-8 sequences could trigger this. For UTF-8 we should always
                        // see a valid character start in every 3 bytes, and if this is the start of the file
                        // so we've done a short read, we should have the character start
                        // somewhere in the usable buffer.
                        if (firstCharPosition == 3 || firstCharPosition == bytesToRead)
                        {
                            throw new InvalidDataException("Invalid UTF-8 data");
                        }
                    }
                    leftOverData = firstCharPosition;

                    int charsRead = encoding.GetChars(buffer, firstCharPosition, bytesToRead - firstCharPosition, charBuffer, 0);
                    int endExclusive = charsRead;

                    for (int i = charsRead - 1; i >= 0; i--)
                    {
                        char lookingAt = charBuffer[i];
                        if (swallowCarriageReturn)
                        {
                            swallowCarriageReturn = false;
                            if (lookingAt == '\r')
                            {
                                endExclusive--;
                                continue;
                            }
                        }
                        // Anything non-line-breaking, just keep looking backwards
                        if (lookingAt != '\n' && lookingAt != '\r')
                        {
                            continue;
                        }
                        // End of CRLF? Swallow the preceding CR
                        if (lookingAt == '\n')
                        {
                            swallowCarriageReturn = true;
                        }
                        int start = i + 1;
                        string bufferContents = new string(charBuffer, start, endExclusive - start);
                        endExclusive = i;
                        string stringToYield = previousEnd == null ? bufferContents : bufferContents + previousEnd;
                        if (!firstYield || stringToYield.Length != 0)
                        {
                            yield return stringToYield;
                        }
                        firstYield = false;
                        previousEnd = null;
                    }

                    previousEnd = endExclusive == 0 ? null : (new string(charBuffer, 0, endExclusive) + previousEnd);

                    // If we didn't decode the start of the array, put it at the end for next time
                    if (leftOverData != 0)
                    {
                        Buffer.BlockCopy(buffer, 0, buffer, bufferSize, leftOverData);
                    }
                }
                if (leftOverData != 0)
                {
                    // At the start of the final buffer, we had the end of another character.
                    throw new InvalidDataException("Invalid UTF-8 data at start of stream");
                }
                if (firstYield && string.IsNullOrEmpty(previousEnd))
                {
                    yield break;
                }
                yield return previousEnd ?? "";
            }
            finally
            {
                stream.Dispose();
            }
        }

        IEnumerator IEnumerable.GetEnumerator()
        {
            return GetEnumerator();
        }
    }
}


// StreamUtil.cs:
public static class StreamUtil
{
    public static void ReadExactly(Stream input, byte[] buffer, int bytesToRead)
    {
        int index = 0;
        while (index < bytesToRead)
        {
            int read = input.Read(buffer, index, bytesToRead - index);
            if (read == 0)
            {
                throw new EndOfStreamException
                    (String.Format("End of stream reached with {0} byte{1} left to read.",
                                   bytesToRead - index,
                                   bytesToRead - index == 1 ? "s" : ""));
            }
            index += read;
        }
    }
}

피드백을 환영합니다. 재미 있었어요 🙂

Question 3

주의 :이 접근 방식은 작동하지 않습니다 (편집에서 설명 됨).

File.ReadLines를 사용하여 줄 반복자를 얻을 수 있습니다.

foreach (var line in File.ReadLines(@"C:\temp\ReverseRead.txt").Reverse())
{
    if (noNeedToReadFurther)
        break;

    // process line here
    Console.WriteLine(line);
}

편집하다:

applejacks01 의 의견을 읽은 후 몇 가지 테스트를 실행 했는데 실제로 전체 파일을로드하는 것처럼 보입니다 .Reverse() .

40MB 파일의 첫 줄File.ReadLines() 을 인쇄 하는 데 사용 했습니다 . 콘솔 앱의 메모리 사용량은 5MB 였습니다. 그런 다음 동일한 파일의 마지막 줄 을 인쇄 하는 데 사용되었습니다 . 메모리 사용량은 95MB 였습니다.File.ReadLines().Reverse()

결론

`Reverse () ‘가 무엇을하든 큰 파일의 맨 아래를 읽는 데는 좋은 선택이 아닙니다 .

Question 4

파일 반복기를 만들려면 다음을 수행하십시오.

편집하다:

이것은 고정 너비 역방향 파일 판독기의 고정 버전입니다.

public static IEnumerable<string> readFile()
{
    using (FileStream reader = new FileStream(@"c:\test.txt",FileMode.Open,FileAccess.Read))
    {
        int i=0;
        StringBuilder lineBuffer = new StringBuilder();
        int byteRead;
        while (-i < reader.Length)
        {
            reader.Seek(--i, SeekOrigin.End);
            byteRead = reader.ReadByte();
            if (byteRead == 10 && lineBuffer.Length > 0)
            {
                yield return Reverse(lineBuffer.ToString());
                lineBuffer.Remove(0, lineBuffer.Length);
            }
            lineBuffer.Append((char)byteRead);
        }
        yield return Reverse(lineBuffer.ToString());
        reader.Close();
    }
}

public static string Reverse(string str)
{
    char[] arr = new char[str.Length];
    for (int i = 0; i < str.Length; i++)
        arr[i] = str[str.Length - 1 - i];
    return new string(arr);
}

Question 5

파일을 한 줄씩 목록에 넣은 다음 List.Reverse ();

        StreamReader objReader = new StreamReader(filename);
        string sLine = "";
        ArrayList arrText = new ArrayList();

        while (sLine != null)
        {
            sLine = objReader.ReadLine();
            if (sLine != null)
                arrText.Add(sLine);
        }
        objReader.Close();


        arrText.Reverse();

        foreach (string sOutput in arrText)
        {

…

Question 6

한 번에 한 문자 씩 뒤로 파일을 읽고 캐리지 리턴 및 / 또는 줄 바꿈에 도달 할 때까지 모든 문자를 캐시 할 수 있습니다.

그런 다음 수집 된 문자열을 뒤집고 한 줄로 소리를 지 릅니다.

Question 7

이 게시물이 매우 오래되었다는 것을 알고 있지만 가장 많이 투표 한 솔루션을 사용하는 방법을 찾을 수 없었기 때문에 마침내 이것을 발견했습니다. 여기에 VB 및 C #에서 낮은 메모리 비용으로 찾은 최고의 답변이 있습니다.

http://www.blakepell.com/2010-11-29-backward-file-reader-vb-csharp-source

이 게시물을 찾는 데 몇 시간이 걸렸기 때문에 다른 사람들을 도와 드리겠습니다.

[편집하다]

다음은 C # 코드입니다.

//*********************************************************************************************************************************
//
//             Class:  BackwardReader
//      Initial Date:  11/29/2010
//     Last Modified:  11/29/2010
//     Programmer(s):  Original C# Source - the_real_herminator
//                     http://social.msdn.microsoft.com/forums/en-US/csharpgeneral/thread/9acdde1a-03cd-4018-9f87-6e201d8f5d09
//                     VB Converstion - Blake Pell
//
//*********************************************************************************************************************************

using System.Text;
using System.IO;
public class BackwardReader
{
    private string path;
    private FileStream fs = null;
    public BackwardReader(string path)
    {
        this.path = path;
        fs = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.ReadWrite);
        fs.Seek(0, SeekOrigin.End);
    }
    public string Readline()
    {
        byte[] line;
        byte[] text = new byte[1];
        long position = 0;
        int count;
        fs.Seek(0, SeekOrigin.Current);
        position = fs.Position;
        //do we have trailing rn?
        if (fs.Length > 1)
        {
            byte[] vagnretur = new byte[2];
            fs.Seek(-2, SeekOrigin.Current);
            fs.Read(vagnretur, 0, 2);
            if (ASCIIEncoding.ASCII.GetString(vagnretur).Equals("rn"))
            {
                //move it back
                fs.Seek(-2, SeekOrigin.Current);
                position = fs.Position;
            }
        }
        while (fs.Position > 0)
        {
            text.Initialize();
            //read one char
            fs.Read(text, 0, 1);
            string asciiText = ASCIIEncoding.ASCII.GetString(text);
            //moveback to the charachter before
            fs.Seek(-2, SeekOrigin.Current);
            if (asciiText.Equals("n"))
            {
                fs.Read(text, 0, 1);
                asciiText = ASCIIEncoding.ASCII.GetString(text);
                if (asciiText.Equals("r"))
                {
                    fs.Seek(1, SeekOrigin.Current);
                    break;
                }
            }
        }
        count = int.Parse((position - fs.Position).ToString());
        line = new byte[count];
        fs.Read(line, 0, count);
        fs.Seek(-count, SeekOrigin.Current);
        return ASCIIEncoding.ASCII.GetString(line);
    }
    public bool SOF
    {
        get
        {
            return fs.Position == 0;
        }
    }
    public void Close()
    {
        fs.Close();
    }
}

Question 8

대용량 파일을위한 매우 빠른 솔루션 . Tail 옵션과 함께 powershell Get-Content cmdlet을 사용합니다. powershell을 호출하면 약간의 오버 헤드가 발생하지만 대용량 파일의 경우 쓸모가 없습니다.

using System.Management.Automation;

const string FILE_PATH = @"d:\temp\b_media_27_34_0000_25393.txt";
var ps = PowerShell.Create();
ps.AddCommand("Get-Content")
    .AddParameter("Path", FILE_PATH)
    .AddParameter("Tail", 1);
var psResults = ps.Invoke();
var lastLine = psResults.FirstOrDefault()?.BaseObject.ToString();

ps.Dispose();

필수 powershell 참조

C : \ Program Files (x86) \ Reference Assemblies \ Microsoft \ WindowsPowerShell \ 3.0 \ System.Management.Automation.dll