I need to remove all occurrences of most common word from string in C.
If there are several words in the text that are repeated the same number of times, the function should remove the one of the most common words that is closest to the beginning of the string. When omitting words, you should not omit surrounding spaces and other characters. If the received string does not contain any words, the function does not need to do anything.
A word is defined as an array of uppercase and lowercase letters. The function does not need to distinguish between uppercase and lowercase letters
My algorithm is the following:
Code:
#include <stdio.h>
#include <limits.h>
#include <ctype.h>
int number_of_word_occurrence(char *s, char *start, char *end) {
int number = 0;
while (*s != '\0') {
char *p = start;
char *q = s;
while (p != end) {
if (*p != *q)break;
p++;
q++;
}
if (p == end)number++;
s++;
}
return number;
}
int length(char *s) {
char *p = s; int number = 0;
while (*p != '\0') {
p++;
number++;
}
return number;
}
char *remove_most_common(char *s) {
int n, max = INT_MIN;
char *p = s;
// Find max occurrence
while (*p != '\0') {
char *start = p;
int word_found = 0;
while (toupper(*p) >= 'A' && toupper(*p) <= 'Z' && *p != '\0') {
word_found = 1;
p++;
}
if (word_found) {
n = number_of_word_occurrence(s, start, p);
if (n > max)max = n;
}
p++;
}
p = s;
int len = length(s);
char *end = s + len;
int i;
// Removing most common word
while (p != end) {
char *start = p, *pp = p;
int word_found = 0;
while (toupper(*pp) >= 'A' && toupper(*pp) <= 'Z' && pp != end) {
word_found = 1;
pp++;
}
if (word_found) {
n = number_of_word_occurrence(s, start, pp);
// If word has max, then remove it
if (n == max) {
while (pp != end) {
*start = *pp;
start++;
pp++;
}
end -= max; // resize end of string
len-=max;
}
}
p++;
}
s[len+=2]='\0';
return s;
}
int main() {
char s[1000] = "Koristio sam auto-stop da dodjem do znaka stop ali prije stopa sam otvorio dekstop kompjutera stop";
printf("%s ", remove_most_common(s) );
return 0;
}
EXAMPLE 1: "Koristio sam auto-stop da dodjem do znaka stop ali prije stopa sam otvorio dekstop kompjutera stop"
OUTPUT: "Koristio sam auto- da dodjem do znaka ali prije stopa sam otvorio dekstop kompjutera "
EXAMPLE 2: " This is string. "
OUTPUT: " is string. "
EXAMPLE 3: "1PsT1 psT2 3Pst pstpst pst";
OUTPUT: " "11 2 3 pstpst "
EXAMPLE 4: "oneeebigggwwooorrrddd";
OUTPUT: ""
Could you help me to fix my code? I have some errors while removing characters. Also, could you help me to remove the word closest to the beginning if all of word occurrences are the same?
string.h
, stdlib.h
libraries, as well as the sprintf
and sscanf
functions from the stdio.h library are not allowed when solving the task. It is not allowed to create auxiliary strings in function or globally.All the major issues are due to the fact that the string is a source of information, while being actively altered.
In general, words are not tokenized properly.
With the input "hello world"
as an example, each of hello
, ello
, llo
, lo
, and o
are considered words when tokenizing the string while looking for the word to remove. The program only advances the string by one character when scanning for words.
The program should advance the string by the length of the current token.
number_of_word_occurrence
considers any substring a valid word when making comparisons.
For the input
Koristio sam auto-stop da dodjem do znaka stop ali prije stopa sam otvorio dekstop kompjutera stop
the maximum count is incorrectly found to be 5
, for stop
. This problem compounds with the problem above, and starts removing incorrectly tokenized data that reports this occurrence count.
To generalize, a large problem with this approach is that as you remove a word from the string, the number of occurrences is going to be different for that word, the next time it is found. Looking at
hello hello hello world world
The max occurrence count of a word here is 3
, for hello
. Looping through to remove the maximum word will see hello
the first time, check its occurrence count, find it to be 3
, the max, and remove it.
hello hello world world
For the second hello
, checking its occurrence count now will return 2
, since the string was altered. This is not the max of 3
, so the string is unchanged.
Additionally, the string is not null-terminated as it is altered - only afterwards. Meaning searching for a word might read stale data, giving bad results.
The strict limitation on what functionality the program can utilize (particularity the restrictions on dynamic memory and auxiliary buffers) does make for very exhaustive solutions.
One solution is to first discover the maximum occurrence count
of any word in the string, and hold a pointer to this word's first appearance in the string. Then, do count
times operations removing the last appearance of the word, which allows you to always have the first appearance as a point of comparison.
A word is removed by overwriting it with everything that follows it in the string (including the null-terminating byte).
Here is a cursory example - largely untested, but provides the correct results for the examples shown. Compile with -DDEBUG
to see additional information.
#include <ctype.h>
#include <stdio.h>
typedef struct word {
const char *base;
size_t length;
} word;
#define length(s) (span((s), NULL))
size_t span(const char *base, int (*test)(int))
{
const char *end = base;
while (test ? test((unsigned char) *end) : *end)
end++;
return (size_t) (end - base);
}
int eql(word a, word b)
{
#ifdef DEBUG
fprintf(stderr, "DEBUG: A{%zu}<<%.*s>> <=> B{%zu}<<%.*s>>\n",
a.length, (int) a.length, a.base,
b.length, (int) b.length, b.base);
#endif
if (!a.length || !b.length || a.length != b.length)
return 0;
if (a.base == b.base)
return 1;
for (size_t i = 0; i < a.length; i++)
if (tolower((unsigned char) a.base[i]) != tolower((unsigned char) b.base[i]))
return 0;
return 1;
}
word get_word(const char *s, const char **end)
{
word w = { 0 };
while (*s && !isalpha((unsigned char) *s))
s++;
w.base = s;
w.length = span(s, isalpha);
*end = (s + w.length);
return w;
}
word find_last(const char *s, word mark, unsigned *count)
{
word last = { 0 };
unsigned c = 0;
for (const char *end; *s; s = end) {
word current = get_word(s, &end);
if (eql(mark, current)) {
last = current;
c++;
}
}
if (count)
*count = c;
return last;
}
word find_most_common(const char *s, unsigned *count)
{
word most_common = { 0 };
*count = 0;
for (const char *end; *s; s = end) {
word current = get_word(s, &end);
if (eql(most_common, current))
continue;
unsigned c;
(void) find_last(s, current, &c);
if (c > *count) {
most_common = current;
*count = c;
}
}
return most_common;
}
void copy(char *dest, char *source, size_t length)
{
for (size_t i = 0; i < length; i++)
dest[i] = source[i];
}
void remove_most_common(char *s)
{
unsigned count = 0;
word common = find_most_common(s, &count);
#ifdef DEBUG
if (count)
fprintf(stderr, "DEBUG: MOST COMMON WORD: [[%.*s]]x%u\n",
(int) common.length, common.base, count);
#endif
size_t len = length(s);
while (count--) {
word last = find_last(s, common, NULL);
copy(
(char *) last.base,
(char *) last.base + last.length,
len - (size_t) (last.base - s) + 1);
len -= last.length;
}
}
int main(void)
{
char buffer[4096];
if (!fgets(buffer, sizeof buffer, stdin))
return 1;
size_t len = length(buffer);
if (len && buffer[len - 1] == '\n')
buffer[len - 1] = 0;
printf("<<%s>>\n", buffer);
remove_most_common(buffer);
printf("<<%s>>\n", buffer);
}