I have implemented a CSV reader, and I think it turned out pretty well. Since CSV is a loosely defined format to begin with, I decided to tolerate some malformed input, such as characters other than a delimiter following an enclosed (quoted) value.
If anyone can point out improvements to this class, I would be happy to hear them.
using System;
using System.Collections.Generic;
using System.Text;
using System.IO;
namespace ConsoleApplication49
{
/// <summary>
/// Forward-only CSV reader that decodes a stream as UTF-8 (honouring a byte order mark)
/// and returns one row per <see cref="Read"/> call.
/// </summary>
/// <remarks>
/// Format rules implemented by this class:
/// <list type="bullet">
/// <item>Values are separated by the configured delimiter.</item>
/// <item>Rows are terminated by CR LF only; a lone CR or LF is treated as ordinary data.</item>
/// <item>A value that starts with a double quote is "enclosed": delimiters and line breaks inside it
/// are data, and a doubled quote ("") stands for one literal quote.</item>
/// <item>Characters that follow the closing quote of an enclosed value are appended to the value
/// instead of being rejected.</item>
/// <item>An empty stream, and a stream whose last row ends with CR LF, yield one additional
/// "implicit" row holding a single empty value.</item>
/// </list>
/// Disposing the reader disposes the internal <see cref="StreamReader"/> and with it the stream
/// passed to the constructor.
/// </remarks>
public class CsvReader : IDisposable
{
    private const char Sym_Escape = '"';
    private const int StandardInitialRowSize = 16;

    private readonly StreamReader reader;
    private readonly char delimiter;
    private readonly char[] buffer;
    private readonly int bufferSize;
    private readonly StringBuilder valueBuilder;

    // Number of valid characters currently held in 'buffer'.
    private int bufferBound;
    // Index of the next character to consume from 'buffer'.
    private int bufferPos;
    // Set once the buffer is exhausted and the underlying reader has no more data.
    private bool endReached;
    // Result of the most recent bounds check: true when the buffer had been fully consumed.
    private bool boundReached;
    // True when the next Read call at end of data must still return one implicit row.
    private bool returnImplicitRow;
    // Capacity used for the next row array; follows the width of the previously returned row.
    private int initialRowSize;
    // Start (inside 'buffer') of the part of the current value not yet copied to 'valueBuilder'.
    private int valuePos;

    /// <summary>
    /// Creates a reader over <paramref name="stream"/>.
    /// </summary>
    /// <param name="stream">Readable stream containing the CSV text.</param>
    /// <param name="delimiter">Value separator; must not be the double quote or a carriage return.</param>
    /// <param name="bufferSize">Size of the character buffer; must be at least 128.</param>
    /// <exception cref="ArgumentNullException"><paramref name="stream"/> is null.</exception>
    /// <exception cref="ArgumentException">
    /// <paramref name="delimiter"/> or <paramref name="bufferSize"/> is invalid.
    /// </exception>
    public CsvReader(Stream stream, char delimiter = ',', int bufferSize = 4096)
    {
        #region check
        if (stream == null)
        {
            throw new ArgumentNullException("stream");
        }
        if (delimiter == Sym_Escape || delimiter == '\r')
        {
            throw new ArgumentException("Invalid delimiter", "delimiter");
        }
        if (bufferSize < 128)
        {
            throw new ArgumentException("Invalid buffer size", "bufferSize");
        }
        #endregion

        this.reader = new StreamReader(stream, Encoding.UTF8, true, bufferSize);
        this.delimiter = delimiter;
        this.buffer = new char[bufferSize];
        this.bufferSize = bufferSize;
        this.initialRowSize = StandardInitialRowSize;
        this.valueBuilder = new StringBuilder(128);

        // An empty input still produces one implicit row. Stream.Length throws
        // NotSupportedException on non-seekable streams, so it is only consulted when the
        // stream can seek; otherwise the reader itself is asked whether any data exists
        // (this may read from the stream).
        returnImplicitRow = stream.CanSeek ? stream.Length == 0 : reader.EndOfStream;
    }

    /// <summary>
    /// Reads the next row.
    /// </summary>
    /// <param name="outRow">
    /// Receives the values of the row, or null when no row is left. Values are never null;
    /// an empty value is returned as an empty string.
    /// </param>
    /// <returns>True when a row was read; false when the end of the data has been reached.</returns>
    public bool Read(out string[] outRow)
    {
        Assert();
        if (endReached)
        {
            if (returnImplicitRow)
            {
                returnImplicitRow = false;
                // The implicit row carries one empty value. It is an empty string rather than
                // null so that it matches every other empty value this reader returns.
                outRow = new string[] { string.Empty };
                return true;
            }
            else
            {
                outRow = null;
                return false;
            }
        }

        string[] row = new string[initialRowSize];
        int rowSize = initialRowSize;
        int rowPos = 0;
        bool newlineReached = false;

        // One iteration per value of the row.
        do
        {
            Assert();
            if (endReached)
            {
                // Data ended right after a delimiter: the row ends with an empty value.
                goto SetValue;
            }

            char chr = buffer[bufferPos++];
            if (chr == Sym_Escape)
            {
                // Enclosed value: consume up to the closing quote.
                Assert();
                if (endReached)
                {
                    goto SetValue;
                }
                valuePos = bufferPos;
                chr = buffer[bufferPos++];
                while (true)
                {
                    FlushAndAssert();
                    if (chr == Sym_Escape)
                    {
                        if (endReached)
                        {
                            // Closing quote was the last character of the data.
                            goto SetValue;
                        }
                        // Peek at the character after the quote without consuming it.
                        chr = buffer[bufferPos];
                        if (chr == Sym_Escape)
                        {
                            // Doubled quote: emit a single literal quote.
                            if (boundReached)
                            {
                                // Pending text was already flushed; only the quote is missing.
                                valueBuilder.Append(Sym_Escape);
                            }
                            else
                            {
                                // Copy the pending text including the first quote of the pair.
                                valueBuilder.Append(buffer, valuePos, bufferPos - valuePos);
                            }
                            bufferPos++;
                            valuePos = bufferPos;
                            Assert();
                        }
                        else
                        {
                            // Closing quote: copy the pending text without the quote.
                            if (!boundReached)
                            {
                                valueBuilder.Append(buffer, valuePos, (bufferPos - valuePos) - 1);
                                valuePos = bufferPos;
                            }
                            // Consume the peeked character; the loop below classifies it.
                            bufferPos++;
                            break;
                        }
                    }
                    else if (boundReached)
                    {
                        // The buffer was replaced, so 'chr' can no longer be copied as part of a
                        // buffer range; append it on its own.
                        valueBuilder.Append(chr);
                    }
                    if (endReached)
                    {
                        // Unterminated enclosed value: return what has been collected.
                        goto SetValue;
                    }
                    chr = buffer[bufferPos++];
                }
            }

            // Plain value, or whatever follows the closing quote of an enclosed value:
            // consume up to the next delimiter, CR LF, or the end of the data.
            while (true)
            {
                FlushAndAssert();
                if (chr == delimiter)
                {
                    if (!boundReached)
                    {
                        valueBuilder.Append(buffer, valuePos, (bufferPos - valuePos) - 1);
                        valuePos = bufferPos;
                    }
                    // A delimiter as the very last character still announces one more (empty)
                    // value; clearing the flag lets the outer loop run once more, where the
                    // bounds check detects the end again.
                    endReached = false;
                    break;
                }
                else if (chr == '\r' && !endReached && buffer[bufferPos] == '\n')
                {
                    if (!boundReached)
                    {
                        valueBuilder.Append(buffer, valuePos, (bufferPos - valuePos) - 1);
                    }
                    bufferPos++;
                    valuePos = bufferPos;
                    Assert();
                    if (endReached)
                    {
                        // The data ends with a line break: report one implicit row afterwards.
                        returnImplicitRow = true;
                    }
                    newlineReached = true;
                    break;
                }
                else if (boundReached)
                {
                    valueBuilder.Append(chr);
                }
                if (endReached)
                {
                    break;
                }
                chr = buffer[bufferPos++];
            }

        SetValue:
            if (rowPos == rowSize)
            {
                rowSize *= 2;
                Array.Resize(ref row, rowSize);
            }
            row[rowPos++] = valueBuilder.ToString();
            valueBuilder.Length = 0;
        }
        while (!endReached && !newlineReached);

        if (rowPos < rowSize)
        {
            Array.Resize(ref row, rowPos);
        }
        outRow = row;
        // Size the next row like this one so that uniform files do not need to grow the array.
        initialRowSize = rowPos;
        return true;
    }

    /// <summary>
    /// Releases the internal reader and the stream it wraps.
    /// </summary>
    public void Dispose()
    {
        Dispose(true);
        GC.SuppressFinalize(this);
    }

    /// <summary>
    /// Releases the resources held by this instance.
    /// </summary>
    /// <param name="disposing">True when called from <see cref="Dispose()"/>.</param>
    protected virtual void Dispose(bool disposing)
    {
        if (disposing)
        {
            reader.Dispose();
        }
    }

    // Bounds check used between values: when the buffer is exhausted, either flags the end of
    // the data or loads the next chunk and resets the positions. Records the outcome in
    // 'boundReached'.
    private void Assert()
    {
        if (bufferPos == bufferBound)
        {
            if (reader.EndOfStream)
            {
                endReached = true;
            }
            else
            {
                bufferBound = reader.Read(buffer, 0, bufferSize);
                bufferPos = 0;
                valuePos = 0;
            }
            boundReached = true;
        }
        else
        {
            boundReached = false;
        }
    }

    // Bounds check used while inside a value. Same as Assert, but before the buffer is replaced
    // it copies the pending part of the value - everything from 'valuePos' up to, but excluding,
    // the character consumed last - into 'valueBuilder'. The caller still holds that last
    // character and decides whether it belongs to the value.
    private void FlushAndAssert()
    {
        if (bufferPos == bufferBound)
        {
            valueBuilder.Append(buffer, valuePos, (bufferPos - valuePos) - 1);
            if (reader.EndOfStream)
            {
                endReached = true;
            }
            else
            {
                bufferBound = reader.Read(buffer, 0, bufferSize);
                bufferPos = 0;
                valuePos = 0;
            }
            boundReached = true;
        }
        else
        {
            boundReached = false;
        }
    }
}
}
Here is some code for testing it:
using System;
using System.Collections.Generic;
using System.Text;
using System.IO;
namespace ConsoleApplication49
{
/// <summary>
/// Console test driver: prints every row of a CSV file with values separated by '|'.
/// </summary>
class Program
{
    // File used when no path is given on the command line.
    private const string DefaultCsvPath = @"D:\Users\Administrator\desktop\test.csv";

    /// <summary>
    /// Reads the CSV file named by the first command-line argument (or the default path) and
    /// writes its rows to the console, with control characters stripped from each value.
    /// </summary>
    /// <param name="args">Optional: args[0] is the path of the CSV file to read.</param>
    static void Main(string[] args)
    {
        string path = (args != null && args.Length > 0) ? args[0] : DefaultCsvPath;

        // The file is only read, so read access with shared reading is sufficient; the using
        // block guarantees the stream is closed (a null resource is allowed by 'using').
        using (FileStream stream = OpenFile(path, FileAccess.Read, FileShare.Read))
        {
            if (stream == null)
            {
                Console.Error.WriteLine("Could not open '" + path + "'.");
            }
            else
            {
                CsvReader csvReader = new CsvReader(stream);
                string[] row;
                while (csvReader.Read(out row))
                {
                    int len = row.Length - 1;
                    for (int i = 0; i <= len; i++)
                    {
                        Console.Write(Filter(row[i], Char.IsControl));
                        if (i < len)
                        {
                            Console.Write('|');
                        }
                    }
                    Console.WriteLine();
                }
            }
        }
        // Keep the console window open until the user presses Enter.
        Console.ReadLine();
    }

    /// <summary>
    /// Returns <paramref name="str"/> without the characters for which
    /// <paramref name="invalidator"/> returns true.
    /// </summary>
    /// <param name="str">Text to filter; null or empty yields an empty string.</param>
    /// <param name="invalidator">Predicate that marks a character for removal.</param>
    /// <returns>The filtered text; never null.</returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="invalidator"/> is null and <paramref name="str"/> is not empty.
    /// </exception>
    public static string Filter(string str, Func<char, bool> invalidator)
    {
        // A null value used to fail with NullReferenceException in the loop below.
        if (string.IsNullOrEmpty(str))
        {
            return string.Empty;
        }
        if (invalidator == null)
        {
            throw new ArgumentNullException("invalidator");
        }

        StringBuilder sb = new StringBuilder(str.Length);
        foreach (char c in str)
        {
            if (!invalidator(c))
            {
                sb.Append(c);
            }
        }
        return sb.ToString();
    }

    /// <summary>
    /// Opens an existing file for reading and writing with exclusive access.
    /// </summary>
    /// <param name="filePath">Path of the file to open.</param>
    /// <returns>The opened stream, or null when the file could not be opened.</returns>
    public static FileStream OpenFile(string filePath)
    {
        return OpenFile(filePath, FileAccess.ReadWrite, FileShare.None);
    }

    /// <summary>
    /// Opens an existing file with the requested access and sharing mode.
    /// </summary>
    /// <param name="filePath">Path of the file to open.</param>
    /// <param name="fileAccess">Requested access to the file.</param>
    /// <param name="fileShare">Access granted to other handles while the file is open.</param>
    /// <returns>
    /// The opened stream, or null when the path is invalid, the file does not exist, is locked,
    /// or access is denied. Other failures (for example out-of-memory conditions)
    /// are not swallowed and propagate to the caller.
    /// </returns>
    public static FileStream OpenFile(string filePath, FileAccess fileAccess, FileShare fileShare)
    {
        try
        {
            return File.Open(filePath, FileMode.Open, fileAccess, fileShare);
        }
        catch (IOException)
        {
            // Missing file or directory, path too long, sharing violation, device error.
            return null;
        }
        catch (UnauthorizedAccessException)
        {
            return null;
        }
        catch (ArgumentException)
        {
            // Null, empty or malformed path, or an out-of-range enum argument.
            return null;
        }
        catch (NotSupportedException)
        {
            return null;
        }
        catch (System.Security.SecurityException)
        {
            return null;
        }
    }
}
}
goto, and the fact that you haven't disposed your stream. But they seem not to have noticed yet that you also have `catch (Exception) { }`, which is arguably worse. Let's catch and continue when we get an `OutOfMemoryException`! Those aren't important! Yeah, nice plan!
`goto` are also controversial. This question feels like it is trolling. Please remove the hyperbole, or give examples of where your code is more complete than other libraries, more performant, etc. I will be happy to go through and edit this myself in an hour or so.