For most things I write I don't usually care about memory usage. However, I have a console app written in .NET 8.0.2 using EF Core 8 that leaks about 2 MB every time an EF transaction runs. I've tried every conceivable arrangement of GC (garbage collector) calls to force .NET to release memory between transactions, but the leak persists. I've also put memory-tracking snippets in all sorts of places, and it all comes back to EF. 2 MB may not seem like much, but this program reads in about 5,000 log files a day (or more), and it eventually crashes the machine (VM or real). I even wrote a memory monitor that uses the GC to watch memory usage, stop processing, and wait until usage goes back down (it never does!).
Anyway, here is the area where the problem lies (LogFileProcessor.cs). If anyone has any ideas on how to release the memory that EF gobbles up, I'd greatly appreciate it (before I go back to plain ADO.NET, which does not have the same issue - go figure).
public partial class LogFileProcessor(LogDbContext dbContext, ILogger<LogFileProcessor> logger)
{
private readonly LogDbContext _dbContext = dbContext;
private readonly ILogger<LogFileProcessor> _logger = logger;
private List<LogEntry> _logEntries = new List<LogEntry>();
private string[] _lines = [];
public async Task<bool> ProcessLogFileAsync(string filePath)
{
bool result = false;
if (!File.Exists(filePath))
{
_logger.LogError("File not found: {filePath}", filePath);
return result;
}
string fileName = Path.GetFileName(filePath);
if (await LogAlreadyProcessedAsync(fileName))
{
_logger.LogInformation("Log file already processed: {fileName}", fileName);
return result;
}
string fileNameNoExt = Path.GetFileNameWithoutExtension(filePath);
DateTime fileDate = File.GetLastWriteTime(filePath);
string fileHash = ProgramBase.ComputeSha256Hash(filePath);
int logFileId = ExtractLogFileId(fileNameNoExt);
string fileType = ExtractFileType(fileNameNoExt);
using var transaction = await _dbContext.Database.BeginTransactionAsync();
try
{
var parsedLog = new ParsedLog
{
FileName = fileName,
LogType = fileType,
LogFileId = logFileId,
DateParsed = DateTime.UtcNow,
FileDate = fileDate,
FileHash = fileHash
};
await _dbContext.ParsedLogs.AddAsync(parsedLog);
await _dbContext.SaveChangesAsync();
int parsedLogId = parsedLog.Id; //retrieve new Id (identity) from ParsedLogs table
//not really needed, but if the user sets the log file size really large, streaming with File.ReadLinesAsync would be better for resources (see the sketch after this class)
_lines = await File.ReadAllLinesAsync(filePath);
int lineNum = 0;
foreach (var line in _lines)
{
var entry = ParseLine(line, parsedLogId, lineNum);
if (entry != null)
{
_logEntries.Add(entry);
}
else
{
throw new Exception($"Unable to parse or convert line {lineNum}");
}
lineNum += 1;
}
await _dbContext.LogEntries.AddRangeAsync(_logEntries);
await _dbContext.SaveChangesAsync();
await transaction.CommitAsync();
_logger.LogInformation("Log file: {fileName} processed and data committed to the database.", fileName);
await transaction.DisposeAsync();
result = true;
}
catch (Exception ex)
{
await transaction.RollbackAsync();
_logger.LogError(ex, "Error processing log file: {fileName}", fileName);
await transaction.DisposeAsync();
result = false;
}
finally
{
_logEntries.Clear();
_lines = [];
// Force garbage collection - naturally, this doesn't work, UGH!
GC.Collect();
GC.WaitForPendingFinalizers();
GC.Collect();
}
return result;
}
private async Task<bool> LogAlreadyProcessedAsync(string fileName)
{
return await _dbContext.ParsedLogs.AsNoTracking().AnyAsync(l => l.FileName == fileName);
}
private static string ExtractFileType(string fileNameNoExt)
{
var match = FileTypeRegex().Match(fileNameNoExt);
return match.Success ? match.Groups[1].Value : "unknown";
}
private static int ExtractLogFileId(string fileNameNoExt)
{
var match = FileIdRegex().Match(fileNameNoExt);
return match.Success ? int.Parse(match.Groups[1].Value) : 0;
}
private static LogEntry? ParseLine(string line, int parsedLogId, int lineNum)
{
var parts = line.Split("->", StringSplitOptions.TrimEntries);
if (parts.Length < 2) return null;
var dateTimePart = parts[0].Trim();
string ipPart = string.Empty;
string statusAndRestPart;
// Check if the IP address is present
if (parts.Length == 3)
{
ipPart = parts[1].Trim();
statusAndRestPart = parts[2].Trim();
}
else
{
// Assume the IP address is missing and adjust accordingly
statusAndRestPart = parts[1].Trim();
}
var statusPart = statusAndRestPart.Split(':', StringSplitOptions.TrimEntries)[0];
var actionDetailsPart = ActionDetailsRegex().Match(statusAndRestPart);
string action = actionDetailsPart.Groups[1].Value.Trim();
string details = actionDetailsPart.Groups[2].Success ? actionDetailsPart.Groups[2].Value.Trim() : string.Empty;
return new LogEntry
{
ParsedLogId = parsedLogId,
LineNum = lineNum,
EntryDate = DateTime.ParseExact(dateTimePart, "ddd, dd MMM yyyy HH:mm:ss", CultureInfo.InvariantCulture),
IPaddress = ipPart,
Status = statusPart,
Action = action,
Details = details
};
}
// generates all regexes at compile time
[GeneratedRegex(@"^(.*?)_\d+$")]
private static partial Regex FileTypeRegex();
[GeneratedRegex(@"_([0-9]+)$")]
private static partial Regex FileIdRegex();
[GeneratedRegex(@"Action=\[(.*?)\](?:, Details=\[(.*?)\])?")]
private static partial Regex ActionDetailsRegex();
}
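For reference, the streaming variant mentioned in the comment above would look roughly like this (a sketch only, reusing the existing ParseLine helper and the parsedLogId local; File.ReadLinesAsync returns IAsyncEnumerable&lt;string&gt;, so only one line is buffered at a time instead of the whole file):
// Sketch: stream the file instead of materializing a string[] with ReadAllLinesAsync
int lineNum = 0;
await foreach (var line in File.ReadLinesAsync(filePath))
{
    var entry = ParseLine(line, parsedLogId, lineNum);
    if (entry == null)
    {
        throw new Exception($"Unable to parse or convert line {lineNum}");
    }
    _logEntries.Add(entry);
    lineNum += 1;
}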
The Program.cs file:
namespace LogParserApp;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Configuration;
using System;
using System.IO;
using Microsoft.EntityFrameworkCore;
using System.Text.RegularExpressions;
using Microsoft.Extensions.Logging;
internal partial class Program : ProgramBase
{
public static async Task Main(string[] args)
{
var settings = ParseArguments(args);
if (!settings.TryGetValue("filetype", out List<string>? value) || value.Count == 0)
{
Console.WriteLine("Please specify at least one filetype using '-filetype \"smtp, pop3\"'.");
return;
}
var host = CreateHostBuilder(args).Build();
// Access the configuration and the LogFileProcessor service
var config = host.Services.GetRequiredService<IConfiguration>();
string? folderPath = settings.TryGetValue("folderpath", out List<string>? value1) && value1.Count > 0 ? value1[0]
: config["LogFileSettings:FolderPath"];
string? archivePath = settings.TryGetValue("archivepath", out List<string>? value2) && value2.Count > 0 ? value2[0]
: config["LogFileSettings:ArchivePath"];
var logFileProcessor = host.Services.GetRequiredService<LogFileProcessor>();
string postProcess = settings.TryGetValue("postprocess", out List<string>? value3) && value3.Count > 0 ? value3[0].ToLower() : "keep";
foreach (var fileType in value)
{
var logFiles = Directory.GetFiles(folderPath ?? "C:\\logs", $"{fileType}_*.txt")
.Select(file => new
{
FileName = file,
OrderKey = int.Parse(OrderKeyRegex().Match(Path.GetFileName(file)).Groups[1].Value)
})
.OrderBy(f => f.OrderKey)
.Select(f => f.FileName);
//long memOffset = GC.GetTotalMemory(forceFullCollection: true); //for tracking memory
foreach (var file in logFiles)
{
// EnsureAvailableMemory(); //to keep program from crashing, no joy
//long startMem = GC.GetTotalMemory(forceFullCollection: true); //for tracking memory
Console.WriteLine($"Processing file: {file}");
var processSuccess = (await logFileProcessor.ProcessLogFileAsync(file));
if (processSuccess)
{
switch (postProcess)
{
case "archive":
string targetPath = Path.Combine(archivePath ?? "C:\\logs\\archive", Path.GetFileName(file));
File.Move(file, targetPath);
Console.WriteLine($"Archived file to: {targetPath}");
break;
case "delete":
File.Delete(file);
Console.WriteLine($"Deleted file: {file}");
break;
case "keep":
// Nothing to do, may add something later to keep, but rename, or what-have-you
break;
}
}
else
{
Console.WriteLine($"Processing failed for file: {file}, skipping post-processing steps.");
}
GC.Collect(0, GCCollectionMode.Forced);
//long endMem = GC.GetTotalMemory(forceFullCollection: true); //for tracking memory
//Console.WriteLine($"Memory Utilized: {(endMem - startMem) / 1048576M:N2} MB"); //for tracking memory
//Console.WriteLine($"Running Memory: {(endMem - memOffset) / 1048576M:N2} MB"); //for tracking memory
}
}
await host.RunAsync();
}
static IHostBuilder CreateHostBuilder(string[] args) =>
Host.CreateDefaultBuilder(args)
.ConfigureAppConfiguration((hostingContext, config) =>
{
config.SetBasePath(Directory.GetCurrentDirectory());
config.AddJsonFile("appsettings.json", optional: false, reloadOnChange: true);
})
.ConfigureServices((hostContext, services) =>
{
services.AddDbContext<LogDbContext>(options =>
options.UseSqlServer(hostContext.Configuration.GetConnectionString("DefaultConnection")));
services.AddScoped<LogFileProcessor>();
services.AddLogging();
services.AddSingleton<IConfiguration>(hostContext.Configuration);
})
.ConfigureLogging(logging => {
logging.ClearProviders();
logging.AddConsole();
logging.AddFilter("Microsoft.EntityFrameworkCore.Database.Command", LogLevel.Warning);
});
// generates a regex at compile time
[GeneratedRegex(@"^.*?_(\d+)\.txt$")]
private static partial Regex OrderKeyRegex();
// this doesn't help - garbage collection never actually occurs, so it stays at 1GB & tries again indefinitely
public static void EnsureAvailableMemory()
{
const long maxAllowedMemory = 1_073_741_824; // Set threshold to 1 GB
while (true)
{
long memoryUsed = GC.GetTotalMemory(false);
Console.WriteLine($"Memory used: {memoryUsed} bytes");
if (memoryUsed < maxAllowedMemory)
{
break;
}
Console.WriteLine("Memory usage is too high, forcing garbage collection.");
GC.Collect();
GC.WaitForPendingFinalizers();
Console.WriteLine("Garbage collection complete, pausing for a few seconds...");
Thread.Sleep(5000); // Wait 5 seconds before checking again
}
}
}
Note the last method (I've tried various things with the GC, but no joy).
The entities (probably won't help, but here they are):
public class ParsedLog
{
public int Id { get; set; }
public string FileName { get; set; } = string.Empty;
public string LogType { get; set; } = string.Empty;
public int LogFileId { get; set; }
public DateTime DateParsed { get; set; }
public DateTime FileDate { get; set; }
public string? FileHash { get; set; } // SHA-256 hash of the file
}
public class LogEntry
{
public long Id { get; set; }
public int ParsedLogId { get; set; }
public int LineNum { get; set; }
public DateTime EntryDate { get; set; }
public string IPaddress { get; set; } = string.Empty;
public string Status { get; set; } = string.Empty;
public string Action { get; set; } = string.Empty;
public string Details { get; set; } = string.Empty;
}
public class LogDbContext(DbContextOptions<LogDbContext> options) : DbContext(options)
{
public DbSet<LogEntry> LogEntries { get; set; }
public DbSet<ParsedLog> ParsedLogs { get; set; }
}
I plan on refactoring for obvious speed improvements (pre-hashing the files, using spans, bulk insert, etc.), but the memory problem is pretty severe when doing thousands of files.
Here is some output demonstrating the memory climbing to 1 GB, 1-2 MB at a time:
PS D:\Projects\LogParserApp> dotnet run -filetype "smtp" -postprocess "archive"
Processing file: D:\EmailLogs\smtp_0.txt
info: LogParserApp.LogFileProcessor[0]
Log file: smtp_0.txt processed and data committed to the database.
Archived file to: D:\EmailLogs\ArchivedLogs\smtp_0.txt
Memory Utilized: 12.27 MB
Running Memory: 12.49 MB
Processing file: D:\EmailLogs\smtp_1.txt
info: LogParserApp.LogFileProcessor[0]
Log file: smtp_1.txt processed and data committed to the database.
Archived file to: D:\EmailLogs\ArchivedLogs\smtp_1.txt
Memory Utilized: 2.78 MB
Running Memory: 15.27 MB
Processing file: D:\EmailLogs\smtp_2.txt
info: LogParserApp.LogFileProcessor[0]
Log file: smtp_2.txt processed and data committed to the database.
Archived file to: D:\EmailLogs\ArchivedLogs\smtp_2.txt
Memory Utilized: 2.48 MB
Running Memory: 17.74 MB
Processing file: D:\EmailLogs\smtp_3.txt
info: LogParserApp.LogFileProcessor[0]
Log file: smtp_3.txt processed and data committed to the database.
Archived file to: D:\EmailLogs\ArchivedLogs\smtp_3.txt
Memory Utilized: 3.28 MB
Running Memory: 21.03 MB
Processing file: D:\EmailLogs\smtp_4.txt
info: LogParserApp.LogFileProcessor[0]
Log file: smtp_4.txt processed and data committed to the database.
Archived file to: D:\EmailLogs\ArchivedLogs\smtp_4.txt
Memory Utilized: 2.28 MB
Running Memory: 23.31 MB
Processing file: D:\EmailLogs\smtp_5.txt
info: LogParserApp.LogFileProcessor[0]
Log file: smtp_5.txt processed and data committed to the database.
Archived file to: D:\EmailLogs\ArchivedLogs\smtp_5.txt
Memory Utilized: 2.55 MB
Running Memory: 25.86 MB
...
...
...
Processing file: D:\EmailLogs\smtp_370.txt
info: LogParserApp.LogFileProcessor[0]
Log file: smtp_370.txt processed and data committed to the database.
Archived file to: D:\EmailLogs\ArchivedLogs\smtp_370.txt
Memory Utilized: 2.36 MB
Running Memory: 999.33 MB
Processing file: D:\EmailLogs\smtp_371.txt
info: LogParserApp.LogFileProcessor[0]
Log file: smtp_371.txt processed and data committed to the database.
Archived file to: D:\EmailLogs\ArchivedLogs\smtp_371.txt
Memory Utilized: 2.59 MB
Running Memory: 1,001.92 MB
Processing file: D:\EmailLogs\smtp_372.txt
info: LogParserApp.LogFileProcessor[0]
Log file: smtp_372.txt processed and data committed to the database.
Archived file to: D:\EmailLogs\ArchivedLogs\smtp_372.txt
Memory Utilized: 2.24 MB
Running Memory: 1,004.16 MB
That's for 373 files - imagine 10 thousand files. :)
The simplest way to handle this is to create a service scope per file. LogFileProcessor (and the LogDbContext injected into it) is registered as scoped, but you resolve it once from the root provider, so a single DbContext lives for the entire run and its change tracker holds a reference to every entity you save - that's the growth you're seeing, and the GC can't collect objects that are still rooted, no matter how hard you force it. With a scope per file:
foreach (var file in logFiles)
{
using var scope = host.Services.CreateScope();
var logFileProcessor = scope.ServiceProvider.GetRequiredService<LogFileProcessor>();
...
All your scoped services will be disposed when the scope is disposed.
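A fuller version of the loop might look like this (a sketch against your Program.cs, keeping your existing names; only the scope handling is new):
foreach (var file in logFiles)
{
    Console.WriteLine($"Processing file: {file}");
    // A fresh scope per file gives a fresh LogDbContext per file, so the
    // change tracker never accumulates entries across files and everything
    // becomes collectible as soon as the scope is disposed.
    using var scope = host.Services.CreateScope();
    var logFileProcessor = scope.ServiceProvider.GetRequiredService<LogFileProcessor>();
    var processSuccess = await logFileProcessor.ProcessLogFileAsync(file);
    // ... archive/delete/keep post-processing exactly as before ...
}
If you'd rather keep one long-lived context, calling _dbContext.ChangeTracker.Clear() after each commit should have much the same effect, since it detaches everything the context is tracking. Registering an IDbContextFactory&lt;LogDbContext&gt; via AddDbContextFactory and creating a context per file is another common pattern for batch jobs like this.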