basically I want a faster and working method to search a binary file for a byte array and grabbing the offset. The byte[] could contain between 5 - 50 bytes. It's gonna read 1 search I have a function that's not working properly and very slow:
static long ReadOneSrch(BinaryReader reader, byte[] bytes)
{
int b;
long i = 0;
while ((b = reader.BaseStream.ReadByte()) != -1)
{
if (b == bytes[i++])
{
if (i == bytes.Length)
return reader.BaseStream.Position - bytes.Length;
}
else
i = b == bytes[0] ? 1 : 0;
}
return -1;
}
Here is my implementation using Boyer-Moore-Horspool on a stream. The core BMH implementation is basically copied from Boyer-Moore-Horspool Algorithm for All Matches (Find Byte array inside Byte array).
The method repeatedly reads the stream into a buffer and applies the BMH algorithm on the buffer until we get a match. In order to also find matches spanning two such reads, we always transfer the last pattern.Length bytes from the previous read to the head of the buffer (could be done even smarter with some effort by evaluating possibe match starts already excluded - but if the pattern is not too long you will hardly notice the difference).
/// <summary>
/// Finds the first occurrence of <paramref name="pattern"/> in a stream
/// </summary>
/// <param name="s">The input stream</param>
/// <param name="pattern">The pattern</param>
/// <returns>The index of the first occurrence, or -1 if the pattern has not been found</returns>
public static long IndexOf(Stream s, byte[] pattern)
{
// Prepare the bad character array is done once in a separate step
var badCharacters = MakeBadCharArray(pattern);
// We now repeatedly read the stream into a buffer and apply the Boyer-Moore-Horspool algorithm on the buffer until we get a match
var buffer = new byte[Math.Max(2 * pattern.Length, 4096)];
long offset = 0; // keep track of the offset in the input stream
while (true)
{
int dataLength;
if (offset == 0)
{
// the first time we fill the whole buffer
dataLength = s.Read(buffer, 0, buffer.Length);
}
else
{
// Later, copy the last pattern.Length bytes from the previous buffer to the start and fill up from the stream
// This is important so we can also find matches which are partly in the old buffer
Array.Copy(buffer, buffer.Length - pattern.Length, buffer, 0, pattern.Length);
dataLength = s.Read(buffer, pattern.Length, buffer.Length - pattern.Length) + pattern.Length;
}
var index = IndexOf(buffer, dataLength, pattern, badCharacters);
if (index >= 0)
return offset + index; // found!
if (dataLength < buffer.Length)
break;
offset += dataLength - pattern.Length;
}
return -1;
}
// --- Boyer-Moore-Horspool algorithm ---
// (Slightly modified code from
// https://stackoverflow.com/questions/16252518/boyer-moore-horspool-algorithm-for-all-matches-find-byte-array-inside-byte-arra)
// Prepare the bad character array is done once in a separate step:
private static int[] MakeBadCharArray(byte[] pattern)
{
var badCharacters = new int[256];
for (long i = 0; i < 256; ++i)
badCharacters[i] = pattern.Length;
for (var i = 0; i < pattern.Length - 1; ++i)
badCharacters[pattern[i]] = pattern.Length - 1 - i;
return badCharacters;
}
// Core of the BMH algorithm
private static int IndexOf(byte[] value, int valueLength, byte[] pattern, int[] badCharacters)
{
int index = 0;
while (index <= valueLength - pattern.Length)
{
for (var i = pattern.Length - 1; value[index + i] == pattern[i]; --i)
{
if (i == 0)
return index;
}
index += badCharacters[value[index + pattern.Length - 1]];
}
return -1;
}