I am writing a more simple version of the Boyer Moore algorithm, and I need to use circular buffer because there could be a really big input. The program should write all positions of symbols which have been compared with a pattern. But my program failed tests when I committed it to a gitlab server. These testes check undefined behaviour in the program, so the reason of their failure is quite obvious. However, I can't spot UB. The only hint is an error message but I don't understand how to use these byte addresses to solve my problem. Also, How "pat" can be underflowed if it is just a basic string?
#include <stdio.h>
#define MAX_PATTERN_LEN 16
#define BUF_SIZE 16
#define ALPH_SIZE 256
void calculate_shifts(const unsigned char *str, int len_str, int *shifts) {
for (int i = 0; i < ALPH_SIZE; i++)
shifts[i] = len_str;
for (int i = 0; i < len_str - 1; i++)
shifts[str[i]] = len_str - 1 - i;
}
int read_str(int max_len, unsigned char *str_place, int *is_patread, int *ind_EOF) {
int count_read = 0;
int ch = EOF;
for (int i = 0; i < max_len; i++) {
ch = getchar();
if (ch == EOF) {
if (!(*ind_EOF))
*ind_EOF = 1;
break;
}
if ((ch == '\n') && (*is_patread == 0)) {
*is_patread = 1;
break;
}
str_place[i] = (char) ch;
count_read++;
}
return count_read;
}
typedef struct {
unsigned char *buf;
int idxIn;
int idxOut;
int size;
} Ring_buf;
void Ring_put(Ring_buf *ring, unsigned char symbol) {
ring->buf[ring->idxIn++] = symbol;
if (ring->idxIn >= ring->size)
ring->idxIn = 0;
}
void Ring_push_idxOut(Ring_buf *ring, int jump) {
int new_indxOut = ring->idxOut + jump;
if (new_indxOut >= ring->size) new_indxOut -= ring->size;
ring->idxOut = new_indxOut;
}
void Ring_init(Ring_buf *ring, unsigned char *buf, int size) {
ring->size = size;
ring->buf = buf;
ring->idxIn = 0;
ring->idxOut = 0;
}
unsigned char Ring_showch(const Ring_buf *ring, int idx) {
int actual_idx = ring->idxOut + idx;
if (actual_idx >= ring->size)
actual_idx -= ring->size;
return ring->buf[actual_idx];
}
int Ring_put_nchars(Ring_buf *ring, int num, int ind_EOF) {
if (ind_EOF) return 0;
int count_read = 0;
int ch = EOF;
for (int i = 0; i < num; i++) {
ch = getchar();
if (ch == EOF)
break;
Ring_put(ring, ch);
count_read++;
}
return (count_read == num);
}
void search_substr(Ring_buf *ring, const unsigned char *pat, int pat_len, int ind_EOF) {
int shift = 0, local_shift = 0, shift_addition = 0, shifts[ALPH_SIZE];
calculate_shifts(pat, pat_len, shifts);
for (;;) {
while (local_shift <= (ring->size - pat_len)) {
int j = pat_len - 1;
for (; (Ring_showch(ring, local_shift + j) == pat[j]) && (j >= 0) ; j--)
printf("%d ", shift + local_shift + j + 1);
if (j < 0) {
shift_addition = shifts[pat[pat_len - 1]];
local_shift += shift_addition;
} else {
printf("%d ", shift + local_shift + j + 1);
shift_addition = shifts[Ring_showch(ring, local_shift + j)];
if (j == 0)
shift_addition = pat_len;
if ((shift_addition == pat_len) && (j < pat_len - 1) && (pat[pat_len - 1] == pat[0]))
shift_addition--;
local_shift += shift_addition;
}
}
int req_addplace = pat_len - 1 - (ring->size - 1 - local_shift);
shift += req_addplace;
if (!Ring_put_nchars(ring, req_addplace, ind_EOF))
break;
Ring_push_idxOut(ring, req_addplace);
local_shift -= req_addplace;
}
}
int main(void) {
unsigned char pat[MAX_PATTERN_LEN + 1] = {'\0'};
unsigned char str[BUF_SIZE] = {'\0'};
int is_patread = 0, ind_EOF = 0;
int pat_len = read_str(MAX_PATTERN_LEN + 1, pat, &is_patread, NULL);
int str_len = read_str(BUF_SIZE, str, &is_patread, &ind_EOF);
if (!str_len || !pat_len)
return 0;
Ring_buf ring;
Ring_init(&ring, str, str_len);
search_substr(&ring, pat, pat_len, ind_EOF);
return 0;
}
input: example\nthis is simple example
output: 7, 14, 13, 12, 11, 10, 20, 22, 21, 20, 19, 18, 17, 16
the error message:
==284==ERROR: AddressSanitizer: stack-buffer-underflow on address 0x7ffd3cdb915f at pc 0x00000040166d bp 0x7ffd3cdb8b90 sp 0x7ffd3cdb8b80
READ of size 1 at 0x7ffd3cdb915f thread T0
#0 0x40166c in search_substr /builds/J6i_AJma/0/c_programming_autumn/23203/i.bogachenkov/template/lab1-0/src/main.c:87
#1 0x40187b in main /builds/J6i_AJma/0/c_programming_autumn/23203/i.bogachenkov/template/lab1-0/src/main.c:121
#2 0x7f068531ab96 in __libc_start_main (/lib/x86_64-linux-gnu/libc.so.6+0x21b96)
#3 0x400b69 in _start (/builds/J6i_AJma/0/c_programming_autumn/23203/i.bogachenkov/template/build_lab1-0/lab1-0+0x400b69)
Address 0x7ffd3cdb915f is located in stack of thread T0 at offset 31 in frame
#0 0x4016af in main /builds/J6i_AJma/0/c_programming_autumn/23203/i.bogachenkov/template/lab1-0/src/main.c:111
This frame has 5 object(s):
[32, 49) 'pat:112' <== Memory access at offset 31 underflows this variable
[96, 112) 'str:113'
[128, 132) 'is_patread:114'
[144, 148) 'ind_EOF:114'
[160, 184) 'ring:119'
HINT: this may be a false positive if your program uses some custom stack unwind mechanism or swapcontext
(longjmp and C++ exceptions *are* supported)
SUMMARY: AddressSanitizer: stack-buffer-underflow /builds/J6i_AJma/0/c_programming_autumn/23203/i.bogachenkov/template/lab1-0/src/main.c:87 in search_substr
Shadow bytes around the buggy address:
0x1000279af1d0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
0x1000279af1e0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
0x1000279af1f0: 00 00 00 00 00 00 00 00 f3 f3 f3 f3 f3 f3 f3 f3
0x1000279af200: f3 f3 f3 f3 f3 f3 f3 f3 00 00 00 00 00 00 00 00
0x1000279af210: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
=>0x1000279af220: 00 00 00 00 00 00 00 00 f1 f1 f1[f1]00 00 01 f2
0x1000279af230: f2 f2 f2 f2 00 00 f2 f2 04 f2 04 f2 00 00 00 f3
0x1000279af240: f3 f3 f3 f3 00 00 00 00 00 00 00 00 00 00 00 00
0x1000279af250: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
0x1000279af260: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
0x1000279af270: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
Shadow byte legend (one shadow byte represents 8 application bytes):
Addressable: 00
Partially addressable: 01 02 03 04 05 06 07
Heap left redzone: fa
Freed heap region: fd
Stack left redzone: f1
Stack mid redzone: f2
Stack right redzone: f3
Stack after return: f5
Stack use after scope: f8
Global redzone: f9
Global init order: f6
Poisoned by user: f7
Container overflow: fc
Array cookie: ac
Intra object redzone: bb
ASan internal: fe
Left alloca redzone: ca
Right alloca redzone: cb
==284==ABORTING
main()
/ read_str()
: int pat_len = read_str(MAX_PATTERN_LEN, pat, &is_patread, NULL)
is undefined behavior (segfault) when ind_EOF
is dereferenced. Either treat it an error or only dereference ind_EOF
if not NULL
:
if(ind_EOF)
*ind_EOF = 1;
read_str()
: Prefer to initialize out parameters instead of relying on caller to do it (before the loop):
if(ind_EOF)
*ind_EOF = 0;
read_str()
: The out parameter str_place
should be a string, i.e. '\0'
terminated, if successful. The minimal fix is to pass in max_len
one less than the array size as you already zero initialize the input strings:
unsigned char pat[MAX_PATTERN_LEN + 1] = {0};
unsigned char str[BUF_SIZE + 1] = {0};
int is_patread = 0, ind_EOF = 0;
int pat_len = read_str(MAX_PATTERN_LEN, pat, &is_patread, NULL);
int str_len = read_str(BUF_SIZE, str, &is_patread, &ind_EOF);
read_str()
: Consider using fgets()
and eliminate both the is_patread
and ind_EOF
arguments as only the 2nd caller uses ind_EOF
.
size_t read_str(size_t n, char *s) {
return fgets(s, n, stdin) ? strlen(s) : 0;
}
... and don't read more input from the user if you know you are going to return anyways (!pat_len
):
int main(void) {
char pat[MAX_PATTERN_LEN];
size_t pat_len = read_str(MAX_PATTERN_LEN, pat);
if(!pat_len) return 0;
char str[BUF_SIZE];
size_t str_len = read_str(BUF_SIZE, str);
if(!str_len) return 0;
int ind_EOF = feof(stdin);
// ...
search_substr()
: Check that j
is in the expected range before using it (i.e. swap the order of the two expressions of the controlling expression):
for (; ((j >= 0) && Ring_showch(ring, local_shift + j) == pat[j]); j--)
Ring_put_nchars()
: With example input the blocks in in the call ch = getchar()
. I don't know if it's expected behavior but it's strange to read in a function that is meant to write.
read_str()
: str_place[i] = (char) ch;
it's a little odd that you don't use (unsigned char)
.