How would I take this and make it into a method that eliminates the repetition (i.e., Don't Repeat Yourself), but also doesn't make a new boxing/object allocation—or at least as few as is feasible.
/// <summary>
/// Splits <paramref name="line"/> into the pieces that contain an '=' sign, trying
/// progressively looser separators: ", " first, then ',', then ' '.
/// </summary>
/// <param name="line">The raw line to split.</param>
/// <returns>
/// The '='-containing pieces produced by the first separator that yields more than two
/// of them, or an empty sequence when none of the separators does.
/// </returns>
private static IEnumerable<string> SeparateLineIntoMultipleDefinitions(string line) {
    // Keeps only the pieces that contain an '=' sign.
    string[] OnlyDefinitions(string[] pieces) => pieces.Where(piece => piece.Contains('=')).ToArray();

    string[] definitions = OnlyDefinitions(line.Split(new[] {", "}, StringSplitOptions.RemoveEmptyEntries));

    // Fall through to the next separator only while the previous attempt found too few pieces.
    if (definitions.Length <= 2) {
        definitions = OnlyDefinitions(line.Split(','));
    }
    if (definitions.Length <= 2) {
        definitions = OnlyDefinitions(line.Split(' '));
    }

    return definitions.Length > 2 ? definitions : Enumerable.Empty<string>();
}
Originally I tried making a method such as this:
// Splits the enclosing `line` on `splitter` and keeps only the pieces that contain '='.
// `splitter` may be a string[], a string, or a char; any other value (including null)
// produces an empty sequence. Because the parameter is typed as object, passing a char boxes it.
IEnumerable<string> SplitEntries(object splitter) {
    string[] pieces;
    if (splitter is string[] separators) {
        pieces = line.Split(separators, StringSplitOptions.RemoveEmptyEntries);
    } else if (splitter is string separator) {
        pieces = line.Split(new[] {separator}, StringSplitOptions.RemoveEmptyEntries);
    } else if (splitter is char separatorChar) {
        pieces = line.Split(separatorChar);
    } else {
        return Enumerable.Empty<string>();
    }

    // The split runs immediately; the '=' filter stays lazy, as in a query expression.
    return pieces.Where(piece => piece.Contains('='));
}
But alas, calling that with a `char` boxes the `char` into an `object`.
For this particular scenario I want it to try splitting on `", "`, then just `','`, then just `' '`. Whereas, if I were to just call `line.Split(',', ' ')`, I believe it would split something like `this=that,there x=y b=c` into `this=that`, `there`, `x=y`, `b=c`. But I don't want that extra entry `there`. (My original intention was to keep `there` by including it in the prior split, i.e., `this=that,there`, but since I was unclear, the answers did not provide a solution to accomplish that and instead remove the entry `there`. Therefore, I'll leave the question phrased as-is.)
I would use regex, but I figure that not using it would provide a more memory-efficient solution (possibly a false premise; I'd be happy to be proven wrong). Although this may be considered over-engineering, I am curious what the solution would be, because I have a feeling I'm missing something very simple.
An alternative way to split, using `ReadOnlySpan<char>` and without using `Regex`, `string.Split()`, or `System.Linq`:
Code (UsingSpan):
/// <summary>
/// Extracts the "key=value" tokens from <paramref name="line"/>. Tokens are delimited by
/// commas and whitespace; a token is kept only when it contains exactly one '=' that is
/// neither its first nor its last character.
/// </summary>
/// <param name="line">The characters to scan.</param>
/// <returns>The accepted tokens, in the order they appear in the line.</returns>
public static IEnumerable<string> SeparateLineIntoMultipleDefinitions(ReadOnlySpan<char> line)
{
    // A comma or any whitespace character ends a token.
    bool IsSeparator(char ch) => ch == ',' || char.IsWhiteSpace(ch);

    var found = new List<string>();
    int pos = 0;
    while (pos < line.Length)
    {
        // Skip the separators in front of the next token.
        while (pos < line.Length && IsSeparator(line[pos]))
        {
            pos++;
        }

        // Consume the token itself.
        int tokenStart = pos;
        while (pos < line.Length && !IsSeparator(line[pos]))
        {
            pos++;
        }

        ReadOnlySpan<char> token = line[tokenStart..pos];
        int equalSign = token.IndexOf('=');

        // Exactly one '=' and it sits strictly inside the token (not at either edge).
        bool isDefinition = equalSign > 0
            && equalSign < token.Length - 1
            && token.LastIndexOf('=') == equalSign;
        if (isDefinition)
        {
            found.Add(token.ToString());
        }
    }

    return found;
}
Code (UsingSpan_ZeroAllocation):
/// <summary>
/// Locates the "key=value" tokens in <paramref name="line"/> and reports them as index
/// ranges instead of strings. Tokens are delimited by commas and whitespace; a token is
/// reported only when it contains exactly one '=' that is neither its first nor its last
/// character.
/// </summary>
/// <param name="line">The characters to scan.</param>
/// <returns>
/// <c>Count</c> is the number of valid entries at the front of <c>Ranges</c>. Each entry
/// holds the token's <c>Start</c> index and its exclusive <c>End</c> index, so
/// <c>line[Start..End]</c> yields the token. <c>Ranges</c> is rented from
/// <see cref="ArrayPool{T}.Shared"/>: it may be longer than <c>Count</c>, entries past
/// <c>Count</c> are meaningless, and the caller should hand the array back with
/// <c>ArrayPool&lt;(int, int)&gt;.Shared.Return</c> once done.
/// </returns>
public static (int Count, (int Start, int End)[] Ranges) SeparateLineIntoMultipleDefinitions_ZeroAllocation(ReadOnlySpan<char> line)
{
    int count = 0;
    // There can never be more tokens than characters, so line.Length is a safe upper bound.
    (int Start, int End)[] ranges = ArrayPool<(int Start, int End)>.Shared.Rent(line.Length);
    bool captureIsStarted = false;
    int equalSignCount = 0;
    int lastEqualSignPosition = 0;
    int captureStart = 0;
    int captureEnd = 0;

    // Iterating one step past the end treats end-of-line as a separator, so the final
    // token is flushed by the same code path as every other token.
    for (int i = 0; i <= line.Length; i++)
    {
        bool isSeparator = i == line.Length || line[i] == ',' || char.IsWhiteSpace(line[i]);
        if (!isSeparator)
        {
            if (captureIsStarted)
            {
                captureEnd = i;
            }
            else
            {
                captureStart = i;
                captureIsStarted = true;
            }

            if (line[i] == '=')
            {
                equalSignCount++;
                lastEqualSignPosition = i;
            }

            continue;
        }

        // A token just ended: keep it when its single '=' lies strictly inside it.
        // For a one-character token captureEnd is stale (it precedes captureStart), which
        // makes the last comparison fail, as it should.
        if (equalSignCount == 1 && lastEqualSignPosition > captureStart && lastEqualSignPosition < captureEnd)
        {
            ranges[count++] = (captureStart, captureEnd + 1);
        }

        equalSignCount = 0;
        captureIsStarted = false;
    }

    return (count, ranges);
}
Example Usage:
var line = "this=that,there x=y b=c";
var (count, ranges) = SeparateLineIntoMultipleDefinitions_ZeroAllocation(line);
try
{
    // Only the first `count` entries of the rented array are meaningful.
    for (int i = 0; i < count; i++)
    {
        // Each entry is (start, exclusive end), not (offset, length), which is exactly
        // what the range operator expects.
        var (start, end) = ranges[i];
        Console.WriteLine(line[start..end]);
    }
}
finally
{
    // The array came from the shared pool, so hand it back even if printing throws.
    ArrayPool<(int, int)>.Shared.Return(ranges);
}
Benchmark:
BenchmarkDotNet=v0.12.0, OS=Windows 10.0.17763.1039 (1809/October2018Update/Redstone5)
Intel Xeon CPU E5-2696 v4 2.20GHz, 2 CPU, 88 logical and 88 physical cores
.NET Core SDK=3.1.101
[Host] : .NET Core 3.1.1 (CoreCLR 4.700.19.60701, CoreFX 4.700.19.60801), X64 RyuJIT
DefaultJob : .NET Core 3.1.1 (CoreCLR 4.700.19.60701, CoreFX 4.700.19.60801), X64 RyuJIT
| Method | Mean | Error | StdDev | Gen 0 | Gen 1 | Gen 2 | Allocated |
|------------------------- |---------:|--------:|--------:|-------:|------:|------:|----------:|
| UsingSpan_ZeroAllocation | 139.9 ns | 0.86 ns | 0.76 ns | - | - | - | - |
| UsingSpan | 176.3 ns | 1.66 ns | 1.47 ns | 0.0067 | - | - | 192 B |
| UsingRegEx | 218.2 ns | 2.62 ns | 2.45 ns | 0.0088 | - | - | 256 B |
| UsingLinq | 339.0 ns | 3.86 ns | 3.42 ns | 0.0100 | - | - | 288 B |
| UsingOPMethod | 853.0 ns | 8.80 ns | 8.23 ns | 0.0210 | - | - | 624 B |