I am trying to write a SPAM filter program that marks messages as SPAM based on a list of keylists or a list of blacklisted senders. The program takes 3 text file inputs, a text file of email, keywords and backlistedsender and gives one output. I am trying to parse the email text file into separate messages then take the sender, subject and content of the message and set them in strings. I'm having a problem splitting the string and only taking the words I need. The only way I have seen to do this is using regrex and substituting the characters for "" but regrex is still pretty foreign to me so I think I might not be doing it right. Below is my code I am pretty sure the issue is caused by the set methods in message but I can't figure out what to do about it. Any suggestions would be appreciated.
import java.util.Scanner;
public class Message {
int id;
String sender, subject, content;
Scanner contentIn;
// Takes a text file
Message(String fullMessageContent){
// Parse text file and set variables
contentIn = new Scanner(fullMessageContent);
setID(contentIn);
setSender(contentIn);
setSubject(contentIn);
setContent(contentIn);
}
private void setContent(Scanner scin) {
// TODO Auto-generated method stub
boolean record = false;
String contentRecord = "";
String input;
while(scin.hasNext()){
input = scin.nextLine();
if(input.contains("Message Body:")){
record = true;
}
while(record==true){
contentRecord += scin.nextLine();
}
}
content = contentRecord;
}
private void setSubject(Scanner scin) {
// TODO Auto-generated method stubd
// Regrex expression - \\s* removes whitespace \\b is bounds
String regrex = "\\s*\\bSubject:\\b\\s*";
while(scin.hasNext()){
String string = scin.next();
if(string.contains("Subject:")){
// Cuts 'Subject:' out of the line and sets the subject variable to the remainder of the line
subject = string.replaceAll(regrex, "");
break;
}
}
}
private void setID(Scanner scin) {
// TODO Auto-generated method stub
// Regrex expression - \\s* removes whitespace \\b is bounds [^(<.*?>)] = not the characters between <>
String regrex = "\\s*\\b[^(<.*?>)]\\b\\s*";
String string = "";
while(scin.hasNext()){
string = scin.nextLine();
if(string.contains("MIN:")){
// Cuts 'MIN:' out of the line and sets the ID int variable to the number between the brackets
// id = Integer.parseInt(string.replaceAll(regrex, ""));
**I know that this regrex expression is not working because the println does not give desired number**
System.out.println(string.replaceAll(regrex, ""));
break;
}
}
}
private void setSender(Scanner scin) {
// TODO Auto-generated method stub
// Regrex expression - \\s* removes whitespace \\b is bounds
String regrex = "\\s*\\From:\\b\\s*";
while(scin.hasNext()){
String string = scin.next();
if(string.contains("From:")){
// Removes <> and From: leaving only the email address
String from = string.replaceAll(regrex, "");
from = from.replace("<", "");
from = from.replace(">", "");
// Trims whitespace on both sides
sender = from.trim();
break;
}
}
}
public int getID(){
return id;
}
public String getSubject(){
return subject;
}
public String getSender(){
return sender;
}
public String getContent(){
return content;
}
}
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Scanner;
import java.util.StringTokenizer;
public class Filter {
ArrayList<String> blackListedSenders, keywords;
PrintWriter output;
Filter(String keywordinput, String blackListinput) {
blackListedSenders = new ArrayList<String>();
keywords = new ArrayList<String>();
setUpBlackList(blackListinput);
setUpKeywords(keywordinput);
outputFileSetUp();
}
private void setUpKeywords(String keywordinput) {
// TODO Auto-generated method stub
Scanner keyin = new Scanner(keywordinput);
String keyword;
while (keyin.hasNext()) {
keyword = keyin.next();
keywords.add(keyword);
}
}
private void setUpBlackList(String blackListinput) {
// TODO Auto-generated method stub
Scanner blin = new Scanner(blackListinput);
String sender;
while (blin.hasNext()) {
sender = blin.next();
blackListedSenders.add(sender);
}
}
private void outputFileSetUp() {
// TODO Auto-generated method stub
String outputHeader = "The following Message Identificaion Numbers have been associated with SPAM:";
try {
output = new PrintWriter(new FileWriter("MessagesMarkedAsSpam.txt"));
output.println(outputHeader);
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
public void searchMessage(Message m) {
String sender = m.getSender();
String content = m.getContent();
if (searchForKeywords(content) == true) {
// Print MIN to file
markSpam(m);
} else if (searchForBlackListedSenders(sender) == true) {
// Print MIN to file
markSpam(m);
}
output.close();
}
private boolean searchForBlackListedSenders(String sender) {
for (int count = 0; count < blackListedSenders.size(); count++) {
if (sender == blackListedSenders.get(count)) {
return true;
}
}
return false;
}
private boolean searchForKeywords(String content) {
// TODO Auto-generated method stub
Scanner contentin = new Scanner(content);
String word;
while (contentin.hasNext()) {
for (int count = 0; count < keywords.size(); count++) {
word = contentin.next();
if (word == keywords.get(count)) {
return true;
}
}
}
return false;
}
private void markSpam(Message m) {
// TODO Auto-generated method stub
output.println(m.getID());
updateLists(m);
}
private void updateLists(Message m) {
// TODO Auto-generated method stub
updateKeywords(m);
updateBlacklist(m);
}
private void updateBlacklist(Message m) {
// TODO Auto-generated method stub
String sender = m.getSender();
if (isInBlackList(m) == true) {
} else {
blackListedSenders.add(sender);
}
}
private boolean isInBlackList(Message m) {
String sender = m.getSender();
boolean isPresent = false;
for (int count = 0; count < keywords.size(); count++) {
if (sender == keywords.get(count)) {
isPresent = true;
}
}
if (isPresent == true) {
return true;
} else {
return false;
}
}
private void updateKeywords(Message m) {
// TODO Auto-generated method stub
String content = m.getContent();
findKeywords(content);
}
private void findKeywords(String content) {
// TODO Auto-generated method stub
content.replaceAll("\\b[\\w']{1,2,3,4,5}\\b", "");
StringTokenizer st = new StringTokenizer(content, " ");
while (st.hasMoreElements()) {
String currentWord = st.nextToken();
if (!isInKeywords(currentWord)) {
keywords.add(currentWord);
}
}
}
private boolean isInKeywords(String currentWord) {
// TODO Auto-generated method stub
boolean isPresent = false;
for (int count = 0; count < keywords.size(); count++) {
if (currentWord == keywords.get(count)) {
isPresent = true;
break;
}
}
if (isPresent == true) {
return true;
} else {
return false;
}
}
}
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Scanner;
public class MessageHandler {
static ArrayList<Message> messages = new ArrayList<Message>();
public static void main(String[] args) {
// TODO Auto-generated method stub
String blacklist, keywords;
Filter spamFinder;
String currentLine = "";
String messageText = "";
Message foundMessage;
boolean again = true;
boolean record = false;
while (again == true) {
try {
File mailfile = new File(args[0]);
File keywordsfile = new File(args[1]);
File blacklistfile = new File(args[2]);
Scanner mailIn = new Scanner(mailfile);
Scanner keywordsIn = new Scanner(keywordsfile);
Scanner blacklistIn = new Scanner(blacklistfile);
while (mailIn.hasNext() == true) {
currentLine = mailIn.nextLine();
if (currentLine.contains("<BEGIN>")) {
record = true;
currentLine = mailIn.nextLine();
}
while (record == true) {
messageText += currentLine;
currentLine = mailIn.nextLine();
if (currentLine.contains("<END>")) {
record = false;
foundMessage = new Message(messageText);
messages.add(foundMessage);
messageText = "";
}
}
}
keywords = setUpKeywords(keywordsIn);
blacklist = setUpBlacklist(blacklistIn);
spamFinder = new Filter(keywords, blacklist);
for (int count = 0; count < messages.size(); count++) {
spamFinder.searchMessage(messages.get(count));
}
again = false;
} catch (IOException e) {
System.out.println("Please try again with correct input file name");
Scanner scan = new Scanner(System.in);
args[0] = scan.next();
} catch (ArrayIndexOutOfBoundsException e) {
System.out.println("usage: MessageHandler <mail.txt> <keywords.txt> <blacklist.txt>");
again = false;
}
;
}
}
private static String setUpBlacklist(Scanner blacklistIn) {
// TODO Auto-generated method stub
String blackList = "";
while (blacklistIn.hasNext()) {
blackList += blacklistIn.nextLine();
}
return blackList;
}
private static String setUpKeywords(Scanner keywordsIn) {
// TODO Auto-generated method stub
String keywords = "";
while (keywordsIn.hasNext()) {
keywords += keywordsIn.nextLine();
}
return keywords;
}
}
This is an example of the type of text file containing multiple email messages that I want to take as input. I am trying to use a scanner with a string input to try and take the text between the <BEGIN>
and <END>
and set up new messages with it. Then within the new message try to take the line that says from
, strip everything but the email, subject, and remove Subject:
and everything remove message body and everything above it and the end tag.
<BEGIN>
From: "carlyle mathe" <noblepierrette@yahoo.com>
To: "olin alonso" <cannon@cs.columbia.edu>
Date: Thu, 5 Apr 2007 22:49:39 -0500
Subject: Generic brand pharmecuticals at a lower price
MIN: <1005001>
Message Body:
Generic brand medications shipped quickly and discreetly to your front
door
http://www.reliefk.org/
------=_NextPart_000_109BBF_01C777D4.A5975940
Content-Type: text/html;
charset="iso-8859-1"
Content-Transfer-Encoding: quoted-printable
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
<HTML><HEAD>
<META http-equiv=3DContent-Type content=3D"text/html; =
charset=3Diso-8859-1">
<META content=3D"MSHTML 6.00.5730.11" name=3DGENERATOR>
<STYLE></STYLE>
</HEAD><FONT face=3DArial><FONT size=3D2>
<BODY>
<DIV>Generic brand medications shipped quickly and discreetly to your =
front door</DIV>
<DIV> </DIV><DIV><A =
href=3D"http://www.reliefk.org/">http://www.reliefk.org/</A></DIV></BODY>=
</HTML>
<END>
<BEGIN>
From: "yankees.com Shop" <feedback@lists.mlb.com>
To: cannon@cs.columbia.edu
Date: Tue, 3 Apr 2007 04:25:24 -0400 (EDT)
Subject: Shop our Youth collection: We've got gear for little fans too!
MIN: <1005002>
Message Body:
We've got great gear for future rookies!
The season is underway and we've stocked up on stuff for all ages.
Shop now >>
http://click.mlb.com/ct/click?q=b4-aJ4bQ6Z1xbiwoAvqJ7GfyUon
MINOR LEAGUE CLEARANCE
Buy one Minor League item from our outlet, get a second outlet item (of
equal or lesser value) for $1!
Offer expires 4.9.07 at 11:59 pm ET.
Shop now >>
http://click.mlb.com/ct/click?q=de-7tzkQMWaOpwc526tTKGtXNJX
............................................................
(c) 2007 MLB Advanced Media, L.P. All rights reserved.
All Major League Baseball trademarks and service marks used
herein are the property of the applicable MLB entity. All other marks
used herein are trademarks or registered trademarks of their
respective owners.
<END>
<BEGIN>
From: WindowsDailyNews <windowsdailynews@ientrynetwork.net>
To: cannon@mts.jhu.edu
Date: Fri, 06 Apr 2007 09:31:32 -0400
Subject: Think about servers inside and outside the box
MIN: <1005003>
Message Body:
Dear Prof. Sir/Madam,
I am forwarding herewith the call for papers. Please circulate it in
your network.
Thanking You,
ACVIT 2007
International Conference on Advances in Computer Vision and
Information Technology
www.acvit.org
28th -30th November 2007
Organized by
Dr. Babasaheb Ambedkar Marathwada University Aurangabad (MS) 431004 India .
Department of Computer Science and Information Technology
Co-Sponsored By IEEE - Bombay Section
Aim
Department of Computer Science and Information Technology will
organize the ACVIT-07 on the auspicious occasion of Golden Jubilee
Year August 2007-08 of Dr. Babasaheb Ambedkar Marathwada University,
Aurangabad (MS)-India.
ACVIT-07 will provide, a high quality forum for academician,
engineers, industrialists, scientists and researchers engaged in
teaching, research and development of Computer Science and Information
Technology to spark ideas and share their views to solve the complex
design challenges and trends the face. Accepted papers will be
published in the Proceedings of ACVIT-07. A contest for the student
Best Paper Award will be held and an award will be given. The regular
technical program will run for three days along with an exhibition of
Computer Vision and Information Technology products. In addition,
tutorial sessions will be held on the first day of the conference.
Papers describing original work are invited in any of the following
areas but not limited to:
Important Dates:
Full Paper Submission: 15th May 2007
Tutorials/Special Session Submission: 15th May 2007
Notification of Acceptance: 15 July 2007
Camera Ready: 15th August 2007
Conference Registration: 15th August 2007
Paper Submission page is under construction, you can submit your
manuscript/full length paper. Via e-mail for details visit
www.acvit.org
Contact:
Dr. K. V. Kale,
Conference Chair,
Professor,
Dept. of Computer Science and Information Technology,
Dr. Babasaheb Ambedkar Marathwada University, Aurangabad (MS) 1004 INDIA
Phone: +91-240-2400431 ext 212
Fax No. : +91-240-2400441
Mobile: +91-9422203089
Email: acvit2007@gmail.com kvkale91@gmail.com
-------------------------------------------------------------------------
Dr. K. V. Kale,
Ph.D. FIETE, Member IEEE, IEEE-SA
Conference General Chair -ACVIT2007
Professor,
Dept. of Computer Science and IT,
Dr. Babasaheb Ambedkar Marathwada University,
Aurangabad. (MS) India. Pin: - 431001
Phone No. (91-240)2400431-37 ext: 461,534
Resi: (91-240)2442118
Mobile: - 91-9422203089
URL: www.bamu.net
Web: www.kvkresearchgroup.esmartweb.com
E-mail: kvkale91@ieee.org / kvkale91@rediffmail.com
<END>
When the hasNext
method on a Scanner
returns false
, it will always return false
thereafter. It does not go back to the start of the text for you.