Skip to main content
We’ve updated our Terms of Service. A new AI Addendum clarifies how Stack Overflow utilizes AI interactions.
deleted 18 characters in body
Source Link
Dmitry Nogin
  • 6.1k
  • 3
  • 21
  • 40

I would try to buffer the output and read files in parallel. You could try it this way:

/// <summary>
/// Entry point: merges every *.csv under C:\a1 into one master file.
/// </summary>
class Program
{
    static void Main(string[] args)
    {
        const string sourceDirectory = @"C:\a1";
        const string masterFile = @"C:\MasterFileA1.csv";

        var mergedCsv = new MergedCsv(sourceDirectory);
        mergedCsv.WriteTo(masterFile);
    }
}

Try adjusting degreeOfParallelism and bufferSize in this helper class:

/// <summary>
/// Merges all top-level *.csv files of a directory into one stream of lines:
/// the header row of the first file, followed by the data rows of every file.
/// Files are read in parallel, so data-row order across files is not guaranteed.
/// </summary>
class MergedCsv : IEnumerable<string>
{
    /// <param name="path">Directory scanned (top level only) for *.csv files.</param>
    /// <param name="degreeOfParallelism">How many files are read concurrently.</param>
    /// <param name="bufferSize">Output buffer size in bytes used by WriteTo.</param>
    /// <exception cref="ArgumentNullException">path is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException">degreeOfParallelism or bufferSize is less than 1.</exception>
    public MergedCsv(string path, int degreeOfParallelism = 5, int bufferSize = 4096)
    {
        if (path == null)
            throw new ArgumentNullException(nameof(path));
        if (degreeOfParallelism < 1)
            throw new ArgumentOutOfRangeException(nameof(degreeOfParallelism));
        if (bufferSize < 1)
            throw new ArgumentOutOfRangeException(nameof(bufferSize));

        Path = path;
        DegreeOfParallelism = degreeOfParallelism;
        BufferSize = bufferSize;
    }

    /// <summary>Writes the merged lines to <paramref name="path"/>, replacing any existing file.</summary>
    public void WriteTo(string path) =>
        // File.Create truncates an existing file; File.OpenWrite would leave
        // stale bytes behind whenever the old file was longer than the new content.
        WriteTo(File.Create(path));

    /// <summary>Writes the merged lines to <paramref name="stream"/>; the stream is closed when done.</summary>
    public void WriteTo(Stream stream)
    {
        using (var buffer = new BufferedStream(stream, BufferSize))
        using (var writer = new StreamWriter(buffer))
            foreach (var line in this)
                writer.WriteLine(line);
    }

    public IEnumerator<string> GetEnumerator() => Header.Concat(Data).GetEnumerator();
    IEnumerator IEnumerable.GetEnumerator() => GetEnumerator();

    /// <summary>Header row from the first file; empty when the directory has no csv files
    /// (instead of throwing like a bare First() would).</summary>
    public IEnumerable<string> Header => Files
        .Take(1)
        .SelectMany(f => File.ReadLines(f).Take(1));

    /// <summary>Data rows (everything after line 1) of every file, read in parallel.</summary>
    public IEnumerable<string> Data => Files
        .AsParallel()
        .WithDegreeOfParallelism(DegreeOfParallelism)
        .Select(f => File.ReadAllLines(f).Skip(1))
        .AsSequential() // Merge back to a single sequence; cross-file line order is not preserved.
        .SelectMany(f => f);

    // Top-level *.csv files only; re-evaluated lazily on every enumeration.
    IEnumerable<string> Files =>
        Directory.GetFiles(Path, "*.csv", SearchOption.TopDirectoryOnly);

    string Path { get; }
    int DegreeOfParallelism { get; }
    int BufferSize { get; }
}

I would try to buffer the output and read files in parallel. You could try it this way:

/// <summary>
/// Entry point: merges every *.csv under C:\a1 into one master file.
/// </summary>
class Program
{
    static void Main(string[] args)
    {
        const string sourceDirectory = @"C:\a1";
        const string masterFile = @"C:\MasterFileA1.csv";

        var mergedCsv = new MergedCsv(sourceDirectory);
        mergedCsv.WriteTo(masterFile);
    }
}

Try adjusting degreeOfParallelism and bufferSize in this helper class:

/// <summary>
/// Merges all top-level *.csv files of a directory into one stream of lines:
/// the header row of the first file, followed by the data rows of every file.
/// Files are read in parallel, so data-row order across files is not guaranteed.
/// </summary>
class MergedCsv : IEnumerable<string>
{
    /// <param name="path">Directory scanned (top level only) for *.csv files.</param>
    /// <param name="degreeOfParallelism">How many files are read concurrently.</param>
    /// <param name="bufferSize">Output buffer size in bytes used by WriteTo.</param>
    /// <exception cref="ArgumentNullException">path is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException">degreeOfParallelism or bufferSize is less than 1.</exception>
    public MergedCsv(string path, int degreeOfParallelism = 5, int bufferSize = 4096)
    {
        if (path == null)
            throw new ArgumentNullException(nameof(path));
        if (degreeOfParallelism < 1)
            throw new ArgumentOutOfRangeException(nameof(degreeOfParallelism));
        if (bufferSize < 1)
            throw new ArgumentOutOfRangeException(nameof(bufferSize));

        Path = path;
        DegreeOfParallelism = degreeOfParallelism;
        BufferSize = bufferSize;
    }

    /// <summary>Writes the merged lines to <paramref name="path"/>, replacing any existing file.</summary>
    public void WriteTo(string path) =>
        // File.Create truncates an existing file; File.OpenWrite would leave
        // stale bytes behind whenever the old file was longer than the new content.
        WriteTo(File.Create(path));

    /// <summary>Writes the merged lines to <paramref name="stream"/>; the stream is closed when done.</summary>
    public void WriteTo(Stream stream)
    {
        using (var buffer = new BufferedStream(stream, BufferSize))
        using (var writer = new StreamWriter(buffer))
            foreach (var line in this)
                writer.WriteLine(line);
    }

    public IEnumerator<string> GetEnumerator() => Header.Concat(Data).GetEnumerator();
    IEnumerator IEnumerable.GetEnumerator() => GetEnumerator();

    /// <summary>Header row from the first file; empty when the directory has no csv files
    /// (instead of throwing like a bare First() would).</summary>
    public IEnumerable<string> Header => Files
        .Take(1)
        .SelectMany(f => File.ReadLines(f).Take(1));

    /// <summary>Data rows (everything after line 1) of every file, read in parallel.</summary>
    public IEnumerable<string> Data => Files
        .AsParallel()
        .WithDegreeOfParallelism(DegreeOfParallelism)
        .Select(f => File.ReadAllLines(f).Skip(1))
        .AsSequential() // Merge back to a single sequence; cross-file line order is not preserved.
        .SelectMany(f => f);

    // Top-level *.csv files only; re-evaluated lazily on every enumeration.
    IEnumerable<string> Files =>
        Directory.GetFiles(Path, "*.csv", SearchOption.TopDirectoryOnly);

    string Path { get; }
    int DegreeOfParallelism { get; }
    int BufferSize { get; }
}

I would buffer the output and read files in parallel:

/// <summary>
/// Entry point: merges every *.csv under C:\a1 into one master file.
/// </summary>
class Program
{
    static void Main(string[] args)
    {
        const string sourceDirectory = @"C:\a1";
        const string masterFile = @"C:\MasterFileA1.csv";

        var mergedCsv = new MergedCsv(sourceDirectory);
        mergedCsv.WriteTo(masterFile);
    }
}

Try adjusting degreeOfParallelism and bufferSize in this helper class:

/// <summary>
/// Merges all top-level *.csv files of a directory into one stream of lines:
/// the header row of the first file, followed by the data rows of every file.
/// Files are read in parallel, so data-row order across files is not guaranteed.
/// </summary>
class MergedCsv : IEnumerable<string>
{
    /// <param name="path">Directory scanned (top level only) for *.csv files.</param>
    /// <param name="degreeOfParallelism">How many files are read concurrently.</param>
    /// <param name="bufferSize">Output buffer size in bytes used by WriteTo.</param>
    /// <exception cref="ArgumentNullException">path is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException">degreeOfParallelism or bufferSize is less than 1.</exception>
    public MergedCsv(string path, int degreeOfParallelism = 5, int bufferSize = 4096)
    {
        if (path == null)
            throw new ArgumentNullException(nameof(path));
        if (degreeOfParallelism < 1)
            throw new ArgumentOutOfRangeException(nameof(degreeOfParallelism));
        if (bufferSize < 1)
            throw new ArgumentOutOfRangeException(nameof(bufferSize));

        Path = path;
        DegreeOfParallelism = degreeOfParallelism;
        BufferSize = bufferSize;
    }

    /// <summary>Writes the merged lines to <paramref name="path"/>, replacing any existing file.</summary>
    public void WriteTo(string path) =>
        // File.Create truncates an existing file; File.OpenWrite would leave
        // stale bytes behind whenever the old file was longer than the new content.
        WriteTo(File.Create(path));

    /// <summary>Writes the merged lines to <paramref name="stream"/>; the stream is closed when done.</summary>
    public void WriteTo(Stream stream)
    {
        using (var buffer = new BufferedStream(stream, BufferSize))
        using (var writer = new StreamWriter(buffer))
            foreach (var line in this)
                writer.WriteLine(line);
    }

    public IEnumerator<string> GetEnumerator() => Header.Concat(Data).GetEnumerator();
    IEnumerator IEnumerable.GetEnumerator() => GetEnumerator();

    /// <summary>Header row from the first file; empty when the directory has no csv files
    /// (instead of throwing like a bare First() would).</summary>
    public IEnumerable<string> Header => Files
        .Take(1)
        .SelectMany(f => File.ReadLines(f).Take(1));

    /// <summary>Data rows (everything after line 1) of every file, read in parallel.</summary>
    public IEnumerable<string> Data => Files
        .AsParallel()
        .WithDegreeOfParallelism(DegreeOfParallelism)
        .Select(f => File.ReadAllLines(f).Skip(1))
        .AsSequential() // Merge back to a single sequence; cross-file line order is not preserved.
        .SelectMany(f => f);

    // Top-level *.csv files only; re-evaluated lazily on every enumeration.
    IEnumerable<string> Files =>
        Directory.GetFiles(Path, "*.csv", SearchOption.TopDirectoryOnly);

    string Path { get; }
    int DegreeOfParallelism { get; }
    int BufferSize { get; }
}
Source Link
Dmitry Nogin
  • 6.1k
  • 3
  • 21
  • 40

I would try to buffer the output and read files in parallel. You could try it this way:

/// <summary>
/// Entry point: merges every *.csv under C:\a1 into one master file.
/// </summary>
class Program
{
    static void Main(string[] args)
    {
        const string sourceDirectory = @"C:\a1";
        const string masterFile = @"C:\MasterFileA1.csv";

        var mergedCsv = new MergedCsv(sourceDirectory);
        mergedCsv.WriteTo(masterFile);
    }
}

Try adjusting degreeOfParallelism and bufferSize in this helper class:

/// <summary>
/// Merges all top-level *.csv files of a directory into one stream of lines:
/// the header row of the first file, followed by the data rows of every file.
/// Files are read in parallel, so data-row order across files is not guaranteed.
/// </summary>
class MergedCsv : IEnumerable<string>
{
    /// <param name="path">Directory scanned (top level only) for *.csv files.</param>
    /// <param name="degreeOfParallelism">How many files are read concurrently.</param>
    /// <param name="bufferSize">Output buffer size in bytes used by WriteTo.</param>
    /// <exception cref="ArgumentNullException">path is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException">degreeOfParallelism or bufferSize is less than 1.</exception>
    public MergedCsv(string path, int degreeOfParallelism = 5, int bufferSize = 4096)
    {
        if (path == null)
            throw new ArgumentNullException(nameof(path));
        if (degreeOfParallelism < 1)
            throw new ArgumentOutOfRangeException(nameof(degreeOfParallelism));
        if (bufferSize < 1)
            throw new ArgumentOutOfRangeException(nameof(bufferSize));

        Path = path;
        DegreeOfParallelism = degreeOfParallelism;
        BufferSize = bufferSize;
    }

    /// <summary>Writes the merged lines to <paramref name="path"/>, replacing any existing file.</summary>
    public void WriteTo(string path) =>
        // File.Create truncates an existing file; File.OpenWrite would leave
        // stale bytes behind whenever the old file was longer than the new content.
        WriteTo(File.Create(path));

    /// <summary>Writes the merged lines to <paramref name="stream"/>; the stream is closed when done.</summary>
    public void WriteTo(Stream stream)
    {
        using (var buffer = new BufferedStream(stream, BufferSize))
        using (var writer = new StreamWriter(buffer))
            foreach (var line in this)
                writer.WriteLine(line);
    }

    public IEnumerator<string> GetEnumerator() => Header.Concat(Data).GetEnumerator();
    IEnumerator IEnumerable.GetEnumerator() => GetEnumerator();

    /// <summary>Header row from the first file; empty when the directory has no csv files
    /// (instead of throwing like a bare First() would).</summary>
    public IEnumerable<string> Header => Files
        .Take(1)
        .SelectMany(f => File.ReadLines(f).Take(1));

    /// <summary>Data rows (everything after line 1) of every file, read in parallel.</summary>
    public IEnumerable<string> Data => Files
        .AsParallel()
        .WithDegreeOfParallelism(DegreeOfParallelism)
        .Select(f => File.ReadAllLines(f).Skip(1))
        .AsSequential() // Merge back to a single sequence; cross-file line order is not preserved.
        .SelectMany(f => f);

    // Top-level *.csv files only; re-evaluated lazily on every enumeration.
    IEnumerable<string> Files =>
        Directory.GetFiles(Path, "*.csv", SearchOption.TopDirectoryOnly);

    string Path { get; }
    int DegreeOfParallelism { get; }
    int BufferSize { get; }
}