Tags: python, machine-learning, deep-learning, pytorch, neural-network

Excessive padding causes accuracy decrease in NN model


I have trained a simple neural network model to perform binary classification and separate real from fake news:

# Create the model class
import torch
from torch import nn

class FakeNewsDetectionModelV0(nn.Module):
    def __init__(self, input_size):
        super().__init__()

        self.layer_1 = nn.Linear(in_features=input_size, out_features=8)
        self.layer_2 = nn.Linear(in_features=8, out_features=1)  # takes the 8 features from the previous layer and outputs a single logit

    # define a forward() for the forward pass
    def forward(self, x, mask):

        # Apply the mask to ignore padded values
        if mask is not None:
            x = x * mask

        x = self.layer_1(x)
        x = self.layer_2(x)
        return x



I use CountVectorizer to turn the text into an array and subsequently into a tensor:

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(min_df=0, lowercase=False)

# fit_transform() both fits the vocabulary and transforms the texts
X = vectorizer.fit_transform(df['text']).toarray()
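
The tensor conversion I then apply (a sketch; the 'label' column name is an assumption about my DataFrame):

X_tensor = torch.tensor(X, dtype=torch.float32)   # shape: (n_documents, vocabulary_size)
y_tensor = torch.tensor(df['label'].values, dtype=torch.float32)   # assuming a 0/1 'label' column

model_0 = FakeNewsDetectionModelV0(input_size=X_tensor.shape[1])   # input_size ends up around 120000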

The problem is that, because the dataset has more than 9000 entries, the input size the model is trained on is really large (around 120000). So when I try to make predictions on single sentences, whose vectors are significantly smaller, I need to excessively pad them to fit the model's input, which greatly hurts my model's accuracy.
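
To make the mismatch concrete, roughly what I observe (a sketch; sizes are from my run):

X = vectorizer.fit_transform(df['text']).toarray()
print(X.shape)       # (9000+, ~120000): one column per word in the whole corpus

single = CountVectorizer(min_df=0, lowercase=False).fit_transform(["some short sentence"]).toarray()
print(single.shape)  # (1, 3): only the words of that one sentence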

from io import StringIO
from torch.nn.functional import pad
import string
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


try:
    #nltk.download('stopwords')
    nltk.download('punkt')
except Exception:
    print("error downloading punkt")

def normalise_text(text):
    # note: str.replace() treats these patterns literally and never matched;
    # re.sub() is needed for the regexes to actually apply
    text = text.lower()                                  # lowercase
    text = re.sub(r"\#", "", text)                       # remove hashtag symbols
    text = re.sub(r"http\S+", "URL", text)               # replace URL addresses
    text = re.sub(r"@", "", text)                        # remove @ symbols
    text = re.sub(r"[^A-Za-z0-9()!?\'\`\"]", " ", text)  # keep only these characters
    text = re.sub(r"\s{2,}", " ", text)                  # collapse repeated whitespace
    text = re.sub(r'[^\w\s]', '', text)                  # strip remaining punctuation
    return text
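
For example, the function above maps a raw string to cleaned text like this:

print(normalise_text("Check this out! http://example.com #fake"))
# -> 'check this out URL fake'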

def fake_news_detection(df, model, model_input_size):
    predictions = []
    max_words = 10000              # only used by the commented-out Keras tokenizer below
    max_length = model_input_size

    model.eval()

    for prediction_data in df['text'][:4000]:
        prediction_data = normalise_text(prediction_data)

        # Use CountVectorizer to transform text data to an array
        # (note: a NEW vectorizer is fitted for every sentence here)
        vectorizer = CountVectorizer(min_df=0, lowercase=False)
        prediction_data_array = vectorizer.fit_transform([prediction_data]).toarray()

        #tokenizer = Tokenizer(num_words=max_words)
        #tokenizer.fit_on_texts([prediction_data])
        #sequences = tokenizer.texts_to_sequences([prediction_data])
        #prediction_data_array = pad_sequences(sequences, maxlen=max_length, value=-1.0)

        # Check the shape of the transformed data
        current_input_size = prediction_data_array.shape[1]

        prediction_data_tensor = torch.tensor(prediction_data_array, dtype=torch.float32)
        mask_tensor = None  # forward() handles a missing mask

        # If the shape doesn't match, pad it and build a mask
        if current_input_size != model_input_size:
            print(current_input_size)
            padding = model_input_size - current_input_size
            prediction_data_tensor = pad(prediction_data_tensor, (0, padding), 'constant', value=0)
            mask_tensor = torch.ones_like(prediction_data_tensor)
            mask_tensor[:, -padding:] = 0  # zero out the padded region
            #print(torch.unique(mask_tensor, return_counts=True))

        with torch.inference_mode():
            prediction = torch.round(torch.sigmoid(model(prediction_data_tensor, mask_tensor))).squeeze()

        predictions.append(round(prediction.item()))

    print(f"our data tensor shape is {prediction_data_tensor.shape}")

    predictions_tensor = torch.FloatTensor(predictions)

    return predictions_tensor
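
I call it like this (model_0 is the trained model and X is the training feature matrix from above):

predictions = fake_news_detection(df, model_0, model_input_size=X.shape[1])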

Does anyone know a workaround that allows me to fit the data to my model without dropping its accuracy score?

Tried: padding the vectors when making predictions on data of a much smaller size.

Expected: accurate predictions, similar to the results I got during the training/evaluation process.

Got: inaccurate predictions with really low accuracy (around 43%).


Solution

  • The problem is that you are creating a different featurizer on every step of your loop. This means you are feeding the model a totally different feature vector on every step.

    You need to fit the CountVectorizer once on your train dataset, then use that same fitted CountVectorizer to transform all the data, both for training and for prediction. Since transform() always produces vectors with the fitted vocabulary's width, every input matches the model's input size and no padding or masking is needed.
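
    A minimal sketch of that approach (the train/test split and the predict() helper are illustrative, not from the question; the model class is yours):

    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.model_selection import train_test_split
    import torch

    # Fit the vectorizer ONCE, on the training texts only
    train_texts, test_texts = train_test_split(df['text'], test_size=0.2, random_state=42)
    vectorizer = CountVectorizer(min_df=0, lowercase=False)
    X_train = vectorizer.fit_transform(train_texts).toarray()

    model = FakeNewsDetectionModelV0(input_size=X_train.shape[1])
    # ... train the model on X_train ...

    def predict(texts, model, vectorizer):
        # transform() reuses the fitted vocabulary, so every input gets the
        # same feature width as training -- no padding or mask needed
        X = vectorizer.transform(texts).toarray()
        x = torch.tensor(X, dtype=torch.float32)
        model.eval()
        with torch.inference_mode():
            return torch.round(torch.sigmoid(model(x, mask=None))).squeeze()

    Words unseen at fit time are simply ignored by transform(), so single sentences still come out with the full training width.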