I have trained a simple neural network model to perform binary classification and separate real news from fake news.
# Create the model class
class FakeNewsDetectionModelV0(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        self.layer_1 = nn.Linear(in_features=input_size, out_features=8)
        self.layer_2 = nn.Linear(in_features=8, out_features=1)  # takes the 8 features from the previous layer and outputs a single feature

    # define a forward() for the forward pass
    def forward(self, x, mask):
        # Apply the mask to zero out padded values
        if mask is not None:
            x = x * mask
        x = self.layer_1(x)
        x = self.layer_2(x)
        return x
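For illustration only (this is not part of my training code), the mask simply zeroes out the chosen feature positions before the first linear layer:

model_0 = FakeNewsDetectionModelV0(input_size=4)   # toy input size
x = torch.ones(1, 4)
mask = torch.tensor([[1., 1., 0., 0.]])            # last two positions treated as padding
print(model_0(x, mask))                            # same output as model_0(x * mask, None)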
I use CountVectorizer to turn the text into an array and subsequently into a tensor:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=0, lowercase=False)
vectorizer.fit(df['text'])
X = vectorizer.fit_transform(df['text']).toarray()
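The array is then converted to a tensor and the model is built from its width (a sketch; I'm assuming here that the 0/1 target lives in a df['label'] column):

X_tensor = torch.tensor(X, dtype=torch.float32)                   # shape: (n_samples, vocab_size)
y_tensor = torch.tensor(df['label'].values, dtype=torch.float32)  # assumed label column
input_size = X_tensor.shape[1]                                    # vocabulary size of the fitted vectorizer
model_0 = FakeNewsDetectionModelV0(input_size=input_size)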
The problem is that, because the dataset has more than 9000 entries, the input size the model is trained on is really large (around 120000). So when I try to make predictions on single sentences, the vectorized size is significantly smaller and I need to pad the sentence excessively to make it fit the model's input, which greatly affects my model's accuracy.
from io import StringIO
from torch.nn.functional import pad
import string
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

try:
    #nltk.download('stopwords')
    nltk.download('punkt')
except:
    print("error in downloading nltk data")

def normalise_text(text):
    text = text.lower()                                   # lowercase
    text = re.sub(r"\#", "", text)                        # remove hashtag symbols
    text = re.sub(r"http\S+", "URL", text)                # replace URL addresses
    text = re.sub(r"@", "", text)                         # remove @ symbols
    text = re.sub(r"[^A-Za-z0-9()!?\'\`\"]", " ", text)   # keep only these characters
    text = re.sub(r"\s{2,}", " ", text)                   # collapse repeated whitespace
    text = re.sub(r"[^\w\s]", "", text)                   # strip remaining punctuation
    return text
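For example, on an illustrative input:

print(normalise_text("BREAKING!!! Read this: https://example.com #fake @someone"))
# -> "breaking read this URL fake someone"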
def fake_news_detection(df, model, model_input_size):
    predictions = []
    max_words = 10000
    max_length = model_input_size
    model.eval()

    for prediction_data in df['text'][:4000]:
        prediction_data = normalise_text(prediction_data)
        #print([prediction_data])

        # Use CountVectorizer to transform text data to an array
        vectorizer = CountVectorizer(min_df=0, lowercase=False)
        prediction_data_array = vectorizer.fit_transform([prediction_data]).toarray()

        #tokenizer = Tokenizer(num_words=max_words)
        #tokenizer.fit_on_texts([prediction_data])
        #sequences = tokenizer.texts_to_sequences([prediction_data])
        #prediction_data_array = pad_sequences(sequences, maxlen=max_length, value=-1.0)
        #print(prediction_data_array.shape)

        # Check the shape of the transformed data
        current_input_size = prediction_data_array.shape[1]
        prediction_data_tensor = torch.tensor(prediction_data_array, dtype=torch.float32)
        mask_tensor = None  # no mask needed if the size already matches

        # If the shape doesn't match the model's input size, pad it
        if current_input_size != model_input_size:
            print(current_input_size)
            padding = model_input_size - current_input_size
            prediction_data_tensor = pad(prediction_data_tensor, (0, padding), 'constant', value=0)
            mask_tensor = torch.ones_like(prediction_data_tensor)
            mask_tensor[:, -padding:] = 0  # Set values in the padded region to 0
            #print(torch.unique(mask_tensor, return_counts=True))

        # Apply the mask to ignore certain values
        #prediction_data_tensor = prediction_data_tensor * mask_tensor

        with torch.inference_mode():
            prediction = torch.round(torch.sigmoid(model(prediction_data_tensor, mask_tensor))).squeeze()
            predictions.append(round(prediction.item()))

    print(f"our data tensor shape is {prediction_data_tensor.shape}")
    predictions_tensor = torch.FloatTensor(predictions)
    return predictions_tensor
Does anyone know of a workaround that allows me to fit the data to my model without dropping its accuracy score?
Tried: padding the vectors when making predictions on data of much smaller size.
Expected: accurate predictions similar to the results I got during the training/evaluation process.
Got: inaccurate predictions with really low accuracy (around 43%).
The problem is that you are creating a different featurizer for every step of your loop. This means you are giving a totally different feature vector to the model on every step.
You need to fit the CountVectorizer once on your training dataset, then use the same fitted CountVectorizer to transform all the data.
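For example, something along these lines (reusing the normalise_text helper and model from the question; the names here are just illustrative):

# Fit the vectorizer once, on the training text only
vectorizer = CountVectorizer(min_df=0, lowercase=False)
X_train = vectorizer.fit_transform(df['text']).toarray()
model_input_size = X_train.shape[1]  # this vocabulary size is the model's input_size

def fake_news_detection(texts, model, vectorizer):
    model.eval()
    cleaned = [normalise_text(t) for t in texts]
    # transform() reuses the vocabulary learned during fit, so every sentence
    # maps to the same feature positions the model was trained on -- no padding needed
    features = vectorizer.transform(cleaned).toarray()
    features_tensor = torch.tensor(features, dtype=torch.float32)
    with torch.inference_mode():
        logits = model(features_tensor, None)  # no padding, so no mask
        return torch.round(torch.sigmoid(logits)).squeeze(1)

predictions_tensor = fake_news_detection(df['text'][:4000], model_0, vectorizer)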