This is my current code:
/*
 * Overwrite every punctuation byte in the first `len` bytes of `s` with a
 * space, in place.  `s` need not be nul-terminated; exactly `len` bytes are
 * examined.
 *
 * The punctuation set is . , ; : ! ? " ( ) [ ] { } -
 * The single quotation mark (') is intentionally NOT part of the set.
 */
static void replace_punctuation(char *s, size_t len)
{
    static const unsigned char marks[] = ".,;:!?\"()[]{}-";
    /* Exclude the string literal's terminating nul from the search set. */
    enum { MARK_COUNT = sizeof marks - 1 };

    for (size_t remaining = len; remaining > 0; --remaining, ++s) {
        const int is_mark = memchr(marks, *s, MARK_COUNT) != NULL;
        if (!is_mark) {
            continue;
        }
        *s = ' ';
    }
}
After profiling (cachegrind/Kcachegrind) the program (compiled with optimizations enabled, -O2), this was found to be one bottleneck.
`s` is not a nul-terminated string, so `strpbrk()` cannot be used.
How can it be optimized?
You can use a lookup table to directly replace each character:
#include <limits.h>
#include <stdlib.h>
/*
 * Overwrite every punctuation byte in the first `len` bytes of `s` with a
 * space, in place, using a 256-entry lookup table (one table load and one
 * store per byte, no per-byte search).
 *
 * `s` need not be nul-terminated; exactly `len` bytes are processed.
 * The punctuation set is . , ; : ! ? " ( ) [ ] { } -
 * (the single quotation mark is not in the set).  All other byte values,
 * including those above 127, are written back unchanged.
 */
static void replace_punctuation(char *s, size_t len)
{
    _Static_assert(UCHAR_MAX == 255, "This code is written for eight-bit char.");

    /*
     * The table holds unsigned char so that the initializers 128..255 are
     * representable as written.  With a plain `char` table they would be
     * out of range wherever char is signed, making the stored value the
     * result of an implementation-defined conversion.
     */
    static const unsigned char table[UCHAR_MAX + 1] =
    {
        // First, we initialize table[i] with i.
          0,   1,   2,   3,   4,   5,   6,   7,
          8,   9,  10,  11,  12,  13,  14,  15,
         16,  17,  18,  19,  20,  21,  22,  23,
         24,  25,  26,  27,  28,  29,  30,  31,
         32,  33,  34,  35,  36,  37,  38,  39,
         40,  41,  42,  43,  44,  45,  46,  47,
         48,  49,  50,  51,  52,  53,  54,  55,
         56,  57,  58,  59,  60,  61,  62,  63,
         64,  65,  66,  67,  68,  69,  70,  71,
         72,  73,  74,  75,  76,  77,  78,  79,
         80,  81,  82,  83,  84,  85,  86,  87,
         88,  89,  90,  91,  92,  93,  94,  95,
         96,  97,  98,  99, 100, 101, 102, 103,
        104, 105, 106, 107, 108, 109, 110, 111,
        112, 113, 114, 115, 116, 117, 118, 119,
        120, 121, 122, 123, 124, 125, 126, 127,
        128, 129, 130, 131, 132, 133, 134, 135,
        136, 137, 138, 139, 140, 141, 142, 143,
        144, 145, 146, 147, 148, 149, 150, 151,
        152, 153, 154, 155, 156, 157, 158, 159,
        160, 161, 162, 163, 164, 165, 166, 167,
        168, 169, 170, 171, 172, 173, 174, 175,
        176, 177, 178, 179, 180, 181, 182, 183,
        184, 185, 186, 187, 188, 189, 190, 191,
        192, 193, 194, 195, 196, 197, 198, 199,
        200, 201, 202, 203, 204, 205, 206, 207,
        208, 209, 210, 211, 212, 213, 214, 215,
        216, 217, 218, 219, 220, 221, 222, 223,
        224, 225, 226, 227, 228, 229, 230, 231,
        232, 233, 234, 235, 236, 237, 238, 239,
        240, 241, 242, 243, 244, 245, 246, 247,
        248, 249, 250, 251, 252, 253, 254, 255,

        /* Then we replace each punctuation character with a space.
           Overriding previous initializers is specified by
           C 2018 6.7.9 paragraph 19.
        */
        ['.'] = ' ',
        [','] = ' ',
        [';'] = ' ',
        [':'] = ' ',
        ['!'] = ' ',
        ['?'] = ' ',
        ['"'] = ' ',
        ['('] = ' ',
        [')'] = ' ',
        ['['] = ' ',
        [']'] = ' ',
        ['{'] = ' ',
        ['}'] = ' ',
        ['-'] = ' ',
    };

    /*
     * Read and write the buffer through an unsigned char pointer: the byte
     * value indexes the table directly and the table entry is stored back
     * without any conversion to a possibly-signed char.
     */
    unsigned char *bytes = (unsigned char *) s;
    for (size_t i = 0; i < len; ++i) {
        bytes[i] = table[bytes[i]];
    }
}
#include <string.h>
#include <stdio.h>
/*
 * Demo driver: print a sample sentence, blank out its punctuation with
 * replace_punctuation(), and print the result.
 */
int main(void)
{
    char sample[] = "This, is- text: with? some [puncutation]!";
    const size_t sample_len = strlen(sample);

    puts(sample);                             /* before */
    replace_punctuation(sample, sample_len);
    puts(sample);                             /* after */

    return 0;
}
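To deal with possible implementation-defined behavior when converting the initializers to a signed `char`, the elements from 128 up can be initialized with `i - 2*(UCHAR_MAX-CHAR_MAX)` instead of just `i`. (With an eight-bit signed `char` this is `i - 256`; where `char` is unsigned it is simply `i`.)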