I have put together the code below to read a particular set of CSV files. It works, but it is very much a work in progress. One section of the code (populating the DataTable row; see the snippet below) is taking as long to run as the SqlBulkCopy operation. I'm asking for advice/recommendations on how to improve performance.
With the code below, processing a ~15M-row file in 50K-row batches took just under 11.5 minutes. Breaking down the sections: SqlBulkCopy took ~236,000 ms (~4 min), the reader needed only ~105,000 ms (~1.75 min), and the section populating the DataTable took ~200,000 ms (~3.33 min).
csvTableTimer.Start();
// Process row and populate datatable
DataRow dr = dt.NewRow();
foreach (DataColumn dc in dt.Columns)
{
if (row.GetType().GetProperty(dc.ToString()).GetValue(row) != null)
{
dr[dc.ToString()] = row.GetType().GetProperty(dc.ToString()).GetValue(row);
}
}
dt.Rows.Add(dr);
csvTableTimer.Stop();
The CSV files are very large (10+ GB) and do not have headers. I'm using the class to build the DataTable structure, and I would like to continue with that approach when populating the DataTable rows, as I'll need to expand this to work with multiple CSV types.
The DataTable reflects the column names from the class, which line up with the SQL DB table. I had wanted to use GetField (converted, not raw), walking each column in the DataTable: row[column.ColumnName] = csv.GetField(column.DataType, column.ColumnName);
but I kept getting an error about there not being headers. I found an open issue relating to HasHeaderRecord = false that matches up with what I was trying to do, which added to my desire to seek advice from those who are more skilled at this. Appreciate the help!
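What I was attempting looked roughly like the sketch below. The name-based GetField overload fails with HasHeaderRecord = false, since it needs a header record to resolve names; an index-based overload such as csv.GetField(column.DataType, column.Ordinal) might be an alternative, assuming the DataTable column order matches the CSV field order:
DataRow dr = dt.NewRow();
foreach (DataColumn column in dt.Columns)
{
    // Fails without headers: name-based field resolution needs a header record.
    // dr[column.ColumnName] = csv.GetField(column.DataType, column.ColumnName);
    // Possible ordinal-based alternative (assumes DataTable column order == CSV field order):
    dr[column.ColumnName] = csv.GetField(column.DataType, column.Ordinal);
}
dt.Rows.Add(dr);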
Expanding on the code block:
var rconfig = new CsvHelper.Configuration.CsvConfiguration(CultureInfo.InvariantCulture)
{
BufferSize = 1024,
Delimiter = ",",
AllowComments = true,
HasHeaderRecord = false,
HeaderValidated = null,
IgnoreBlankLines = true,
MissingFieldFound = null,
Comment = '#',
Escape = '"',
TrimOptions = TrimOptions.Trim,
BadDataFound = x =>
{
isBadRecord = true;
ErrRecords.Add(x.RawRecord);
++badCount;
}
};
var loadFType = @"BAL";
// Create datatable using class as definition.
PropertyDescriptorCollection props1 = TypeDescriptor.GetProperties(loaderFileType);
DataTable dt = UtilExtensions.CreateDataTable(props1);
using (var reader = new StreamReader(rFile))
{
reader.ReadLine();
using (var csv = new CsvReader(reader, rconfig))
{
switch (loadFType)
{
case "ALL":
csv.Context.RegisterClassMap<CSVLoader.AMap>();
var allRecords = new List<CSVLoader.A>();
break;
case "BAL":
csv.Context.RegisterClassMap<CSVLoader.BMap>();
var balRecords = new List<CSVLoader.B>();
break;
case "CIF":
csv.Context.RegisterClassMap<CSVLoader.CMap>();
var cifRecords = new List<CSVLoader.C>();
break;
}
dt.BeginLoadData();
while (csv.Read())
{
csvReadTimer.Start();
var row = csv.GetRecord(loaderFileType);
csvReadTimer.Stop();
runningCount++;
if (!isBadRecord)
{
csvTableTimer.Start();
// Process row and populate datatable
DataRow dr = dt.NewRow();
foreach (DataColumn dc in dt.Columns)
{
if (row.GetType().GetProperty(dc.ToString()).GetValue(row) != null)
{
dr[dc.ToString()] = row.GetType().GetProperty(dc.ToString()).GetValue(row);
}
}
dt.Rows.Add(dr);
csvTableTimer.Stop();
++goodCount;
if (batchCount >= dtbatchSize || runningCount >= fileRecCount)
{
try
{
// Write from the source to the destination.
bcpLoadTimer.Start();
bulkCopy.WriteToServer(dt);
bcpLoadTimer.Stop();
bcpLoadBatchCount++;
}
catch (Exception ex)
{
}
dt.Clear();
batchCount = 0;
}
batchCount++;
}
isBadRecord = false;
}
dt.EndLoadData();
}
reader.Close();
}
dt.Clear();
transaction.Commit();
// B
public class B
{
[Index(0)]
public string A { get; set; }
[Index(1)]
public string BString { get; set; }
[Index(2)]
public int? C { get; set; }
[Index(3)]
public string D { get; set; }
[Index(4)]
public string E { get; set; }
[Index(5)]
public DateTime? F { get; set; }
[Index(6)]
public decimal? G { get; set; }
[Index(7)]
public decimal? H { get; set; }
[Index(8)]
public decimal? I { get; set; }
[Index(9)]
public decimal? J { get; set; }
[Index(10)]
public int? K { get; set; }
[Index(11)]
public string L { get; set; }
[Index(12)]
public DateTime? M { get; set; }
}
// BMap
public sealed class BMap : ClassMap<B>
{
public BMap()
{
// AutoMap(CultureInfo.InvariantCulture);
Map(m => m.A).Index(0);
Map(m => m.BString).Index(1);
Map(m => m.C).Index(2);
Map(m => m.D).Index(3);
Map(m => m.E).Index(4);
Map(m => m.F).Index(5).TypeConverterOption.Format("yyyyMMdd");
Map(m => m.G).Index(6);
Map(m => m.H).Index(7);
Map(m => m.I).Index(8);
Map(m => m.J).Index(9);
Map(m => m.K).Index(10);
Map(m => m.L).Index(11);
Map(m => m.M).Index(12).TypeConverterOption.Format("yyyy-MM-dd-hh.mm.ss.ffffff");
}
}
Your question doesn't really include a minimal reproducible example, so I simplified your code to create the following FileLoader class that times how long it takes to populate the DataTable from instances of some class TClass (here B) that had been read from a CSV row using CsvReader:
public class FileLoader
{
public System.Diagnostics.Stopwatch csvTableTimer { get; } = new();
public long Load<TClass, TClassMap>(string rFile, int dtbatchSize) where TClassMap : ClassMap<TClass>, new()
{
bool isBadRecord = false;
long badCount = 0;
long runningCount = 0;
long goodCount = 0;
long batchCount = 0;
var rconfig = CreateCsvConfiguration(
x =>
{
isBadRecord = true;
//ErrRecords.Add(x.RawRecord);
++badCount;
});
// Create datatable using class as definition.
var dt = UtilExtensions.CreateDataTable(typeof(TClass));
using (var reader = new StreamReader(rFile))
{
//reader.ReadLine(); FIXED - THIS SKIPPED THE FIRST LINE AND CAUSED A RECORD TO BE OMITTED.
using (var csv = new CsvReader(reader, rconfig))
{
csv.Context.RegisterClassMap<TClassMap>();
dt.BeginLoadData();
while (csv.Read())
{
isBadRecord = false;
//csvReadTimer.Start();
var record = csv.GetRecord<TClass>();
//csvReadTimer.Stop();
runningCount++;
if (!isBadRecord)
{
csvTableTimer.Start();
// Process row and populate datatable
DataRow dr = dt.NewRow();
foreach (DataColumn dc in dt.Columns)
{
if (record.GetType().GetProperty(dc.ToString()).GetValue(record) != null)
{
dr[dc.ToString()] = record.GetType().GetProperty(dc.ToString()).GetValue(record);
}
}
dt.Rows.Add(dr);
csvTableTimer.Stop();
goodCount++;
if (++batchCount >= dtbatchSize)
{
// Flush the data table
FlushTable(dt);
batchCount = 0;
}
}
}
dt.EndLoadData();
FlushTable(dt);
Commit();
}
}
return goodCount;
}
protected virtual void FlushTable(DataTable dt) => dt.Clear(); // Replace with SqlBulkCopy
protected virtual void Commit() {} // Replace with transaction.Commit();
public static CsvConfiguration CreateCsvConfiguration(BadDataFound badDataFound) =>
new CsvHelper.Configuration.CsvConfiguration(CultureInfo.InvariantCulture)
{
BufferSize = 1024,
Delimiter = ",",
AllowComments = true,
HasHeaderRecord = false,
HeaderValidated = null,
IgnoreBlankLines = true,
MissingFieldFound = null,
Comment = '#',
Escape = '"',
TrimOptions = TrimOptions.Trim,
BadDataFound = badDataFound,
};
}
public static partial class UtilExtensions
{
static IEnumerable<PropertyInfo> GetSerializableProperties(this Type type) =>
type.GetProperties().Where(p => p.GetIndexParameters().Length == 0 && p.CanRead && p.CanWrite && p.GetGetMethod() != null && p.GetSetMethod() != null);
public static DataTable CreateDataTable(Type type)
{
var dt = new DataTable();
foreach (var p in type.GetSerializableProperties())
dt.Columns.Add(p.Name, Nullable.GetUnderlyingType(p.PropertyType) ?? p.PropertyType);
return dt;
}
}
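As a quick illustrative check (not part of the loader), CreateDataTable produces one column per readable/writable property, unwrapping nullable types:
var dt = UtilExtensions.CreateDataTable(typeof(B));
foreach (DataColumn col in dt.Columns)
    Console.WriteLine($"{col.ColumnName}: {col.DataType.Name}");
// Prints e.g. "C: Int32" rather than "C: Nullable`1", because
// Nullable.GetUnderlyingType(typeof(int?)) returns typeof(int).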
Then, if I use the file loader and call loader.Load<B, BMap>(rFile, 1000) to read a CSV file with 5555 rows 20 times, it takes roughly 1049 ms on dotnetfiddle. See demo #1 here.
One problem you are encountering is that reflection in C# can be very slow. You are calling record.GetType().GetProperty(dc.ToString()).GetValue(record) twice per column, and if I simply reduce the number of calls by one, the time drops to around 706 ms:
foreach (DataColumn dc in dt.Columns)
{
var value = record.GetType().GetProperty(dc.ToString()).GetValue(record);
if (value != null)
{
dr[dc.ToString()] = value;
}
}
Demo #2 here.
However, we can do better by manufacturing a delegate at runtime. First, add the following utility methods that make use of the System.Linq.Expressions namespace:
public static partial class UtilExtensions
{
public static Func<TSource, object> CreatePropertyGetter<TSource>(PropertyInfo propertyInfo)
{
var parameter = Expression.Parameter(typeof(TSource), "obj");
var property = Expression.Property(parameter, propertyInfo);
var convert = Expression.Convert(property, typeof(object));
var lambda = Expression.Lambda(typeof(Func<TSource, object>), convert, parameter);
return (Func<TSource, object>)lambda.Compile();
}
public static ReadOnlyDictionary<string, Func<TSource, object>> PropertyGetters<TSource>() => PropertyExpressionsCache<TSource>.PropertyGetters;
static ReadOnlyDictionary<string, Func<TSource, object>> CreatePropertyGetters<TSource>() =>
typeof(TSource)
.GetSerializableProperties()
.ToDictionary(p => p.Name,
p => CreatePropertyGetter<TSource>(p))
.ToReadOnly();
static class PropertyExpressionsCache<TSource>
{
public static ReadOnlyDictionary<string, Func<TSource, object>> PropertyGetters { get; } = UtilExtensions.CreatePropertyGetters<TSource>();
}
public static ReadOnlyDictionary<TKey, TValue> ToReadOnly<TKey, TValue>(this IDictionary<TKey, TValue> dictionary) =>
new ReadOnlyDictionary<TKey, TValue>(dictionary ?? throw new ArgumentNullException());
}
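To show what the cache hands back, here is a small usage sketch (illustrative only; the dictionary is built once per TSource by the static cache class and reused on every call):
var getters = UtilExtensions.PropertyGetters<B>();
var record = new B { A = "x", C = 42 };
// Each entry is a compiled Func<B, object>, so these are ordinary delegate
// invocations rather than PropertyInfo.GetValue reflection calls.
object c = getters["C"](record);  // boxes 42
object d = getters["D"](record);  // null; D was never set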
And modify Load<TClass, TClassMap>() as follows:
public long Load<TClass, TClassMap>(string rFile, int dtbatchSize) where TClassMap : ClassMap<TClass>, new()
{
bool isBadRecord = false;
long badCount = 0;
long runningCount = 0;
long goodCount = 0;
long batchCount = 0;
var rconfig = CreateCsvConfiguration(
x =>
{
isBadRecord = true;
//ErrRecords.Add(x.RawRecord);
++badCount;
});
var loaderFileType = typeof(TClass);
// Create datatable using class as definition.
var dt = UtilExtensions.CreateDataTable(loaderFileType);
var properties = UtilExtensions.PropertyGetters<TClass>();
using (var reader = new StreamReader(rFile))
{
//reader.ReadLine(); FIXED - THIS SKIPPED THE FIRST LINE AND CAUSED A RECORD TO BE OMITTED.
using (var csv = new CsvReader(reader, rconfig))
{
csv.Context.RegisterClassMap<TClassMap>();
dt.BeginLoadData();
while (csv.Read())
{
isBadRecord = false;
//csvReadTimer.Start();
var record = csv.GetRecord<TClass>();
//csvReadTimer.Stop();
runningCount++;
if (!isBadRecord)
{
csvTableTimer.Start();
// Process row and populate datatable
DataRow dr = dt.NewRow();
foreach (var p in properties)
{
var value = p.Value(record);
if (value != null)
dr[p.Key] = value;
}
dt.Rows.Add(dr);
csvTableTimer.Stop();
goodCount++;
if (++batchCount >= dtbatchSize)
{
// Flush the data table
FlushTable(dt);
batchCount = 0;
}
}
}
dt.EndLoadData();
FlushTable(dt);
}
}
return goodCount;
}
The time will be further reduced, to roughly 404 ms. Demo fiddle #3 here.
I also tried using Delegate.CreateDelegate() instead of Expression:
public static partial class UtilExtensions
{
static Func<TSource, object> CreateTypedPropertyGetter<TSource, TValue>(PropertyInfo propertyInfo)
{
var typedFunc = (Func<TSource, TValue>)Delegate.CreateDelegate(typeof(Func<TSource, TValue>), propertyInfo.GetGetMethod());
return i => (object)typedFunc(i);
}
public static Func<TSource, object> CreatePropertyGetter<TSource>(PropertyInfo propertyInfo)
{
var typedCreator = typeof(UtilExtensions).GetMethod(nameof(CreateTypedPropertyGetter), BindingFlags.Static | BindingFlags.Public | BindingFlags.NonPublic);
var concreteTypedCreator = typedCreator.MakeGenericMethod(typeof(TSource), propertyInfo.PropertyType);
return (Func<TSource, object>)concreteTypedCreator.Invoke(null, new object [] { propertyInfo });
}
public static ReadOnlyDictionary<string, Func<TSource, object>> PropertyGetters<TSource>() => PropertyExpressionsCache<TSource>.PropertyGetters;
static ReadOnlyDictionary<string, Func<TSource, object>> CreatePropertyGetters<TSource>() =>
typeof(TSource)
.GetSerializableProperties()
.ToDictionary(p => p.Name,
p => CreatePropertyGetter<TSource>(p))
.ToReadOnly();
static class PropertyExpressionsCache<TSource>
{
public static ReadOnlyDictionary<string, Func<TSource, object>> PropertyGetters { get; } = UtilExtensions.CreatePropertyGetters<TSource>();
}
public static ReadOnlyDictionary<TKey, TValue> ToReadOnly<TKey, TValue>(this IDictionary<TKey, TValue> dictionary) =>
new ReadOnlyDictionary<TKey, TValue>(dictionary ?? throw new ArgumentNullException());
}
And got roughly the same time, of 410 ms. Demo fiddle #4 here.
Notes:
The code in your question skips the first line of the CSV file by calling reader.ReadLine(). In my test harness this caused an incorrect number of records to be read, so I removed that line.
Rather than having a non-generic method with a switch on the record type, I extracted a generic method that takes the record type and class map type as generic parameters. This makes delegate creation a little easier, as it is no longer necessary to do runtime casting to the record type.
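For instance, the original switch on loadFType then collapses into a plain dispatch to the generic method (a sketch, assuming the type and map names from your question and the loader instance from the demos):
long goodCount = loadFType switch
{
    "ALL" => loader.Load<CSVLoader.A, CSVLoader.AMap>(rFile, dtbatchSize),
    "BAL" => loader.Load<CSVLoader.B, CSVLoader.BMap>(rFile, dtbatchSize),
    "CIF" => loader.Load<CSVLoader.C, CSVLoader.CMap>(rFile, dtbatchSize),
    _ => throw new ArgumentException($"Unknown file type: {loadFType}"),
};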