Search code examples
javatwittertwitter4j

Segregating filtered tweets based on matched keywords : Twitter4j API


I have created a Twitter stream filtered by some keywords, as follows.

// Obtain the stream and describe which keywords should be tracked.
TwitterStream twitterStream = getTwitterStreamInstance();
FilterQuery filtre = new FilterQuery();
String[] keywordsArray = { "iphone", "samsung" , "apple", "amazon"};
filtre.track(keywordsArray);
// Register the listener BEFORE starting the stream: filter() begins consuming
// statuses, and Twitter4J expects a listener to be present at that point.
twitterStream.addListener(listener);
twitterStream.filter(filtre);

What is the best way to segregate tweets based on the keywords they match? For example, all tweets that match "iphone" should be stored in the "IPHONE" table, all tweets that match "samsung" should be stored in the "SAMSUNG" table, and so on. NOTE: The number of filter keywords is about 500.


Solution

  • It seems that the only way to find out which keyword a tweet belongs to is to iterate over multiple properties of the Status object. The following code requires a database service with a method insertTweet(String tweetText, Date createdAt, String keyword). Every tweet is stored in the database once per matching keyword, so a tweet that matches several keywords is stored several times. If at least one keyword is found in the tweet text, the additional properties are not searched for more keywords.

    // Maps each tracked keyword to the compiled pattern used to detect it in tweet text.
    // The map is filled by start(...) and emptied by stop(); the reference itself never
    // changes, hence final.
    // NOTE(review): the stream listener reads this map while start/stop mutate it --
    // confirm whether the listener runs on another thread and needs synchronization.
    private final Map<String, Pattern> keywordsMap = new HashMap<>();
    // The stream created by start(...); remains null until start(...) has created one.
    private TwitterStream twitterStream;
    // Receives one insertTweet(...) call per matched keyword. It is not assigned anywhere
    // in this snippet, so it must be provided before the first tweet arrives.
    private DatabaseService databaseService; // implement and add this service
    
    /**
     * (Re)starts the filtered stream for the given keywords.
     *
     * <p>Any previous stream is stopped first. Every keyword is stored together with a
     * case-insensitive pattern that is later used to find out which keyword a tweet matched.
     * The keywords are then tracked through a {@code FilterQuery} restricted to the language
     * code {@code "en"}.
     *
     * @param keywords the keywords to track; when {@code null} or empty, no stream is started
     *                 and a message is written to {@code System.err}
     */
    public void start(List<String> keywords) {
        stop(); // stop the streaming first, if it is already running

        if (keywords == null || keywords.isEmpty()) {
            System.err.println("Could not start querying because there are no keywords.");
            return;
        }

        for (String keyword : keywords) {
            // LITERAL: keywords are plain search terms (they are also passed verbatim to
            // track() below), so characters such as '+', '$' or '(' must not be interpreted
            // as regular-expression syntax.
            keywordsMap.put(keyword, Pattern.compile(keyword, Pattern.CASE_INSENSITIVE | Pattern.LITERAL));
        }

        twitterStream = new TwitterStreamFactory().getInstance();
        StatusListener listener = new StatusListener() {
            @Override
            public void onStatus(Status status) {
                insertTweetWithKeywordIntoDatabase(status);
            }
            /* add the unimplemented methods from the interface */
        };
        // The listener is registered before filter() starts the stream.
        twitterStream.addListener(listener);

        FilterQuery filterQuery = new FilterQuery();
        filterQuery.track(keywordsMap.keySet().toArray(new String[0]));
        filterQuery.language(new String[]{"en"});

        twitterStream.filter(filterQuery);
    }
    
    /**
     * Stops tracking: forgets all registered keywords and, if a stream has been created,
     * shuts it down. Does nothing beyond clearing the keywords when no stream exists yet.
     */
    public void stop() {
        keywordsMap.clear();

        if (twitterStream == null) {
            // start(...) has never created a stream, so there is nothing to shut down
            return;
        }
        twitterStream.shutdown();
    }
    
    /**
     * Stores a tweet once for every tracked keyword it matches.
     *
     * <p>The tweet text is searched first. Only if it contains no keyword are the additional
     * properties searched: the tweet's expanded URLs, the retweeted status' text, the text and
     * expanded URLs of the status quoted by the retweeted status, and the text and expanded
     * URLs of the quoted status. If still nothing matches, the status is reported on
     * {@code System.err} and not stored.
     *
     * @param status the received status
     */
    private void insertTweetWithKeywordIntoDatabase(Status status) {
        // search for keywords in tweet text
        List<String> keywords = getKeywordsFromTweet(status.getText());

        if (keywords.isEmpty()) {
            // StringBuilder suffices: the buffer never leaves this method.
            StringBuilder additionalData = new StringBuilder();

            // get extended urls
            appendExpandedUrls(additionalData, status.getURLEntities());

            Status retweeted = status.getRetweetedStatus();
            if (retweeted != null) {
                // get retweeted status -> text
                appendFragment(additionalData, retweeted.getText());

                Status quotedByRetweet = retweeted.getQuotedStatus();
                if (quotedByRetweet != null) {
                    // get retweeted status -> quoted status -> text and extended urls
                    appendFragment(additionalData, quotedByRetweet.getText());
                    appendExpandedUrls(additionalData, quotedByRetweet.getURLEntities());
                }
            }

            Status quoted = status.getQuotedStatus();
            if (quoted != null) {
                // get quoted status -> text and extended urls
                appendFragment(additionalData, quoted.getText());
                appendExpandedUrls(additionalData, quoted.getURLEntities());
            }

            keywords = getKeywordsFromTweet(additionalData.toString());
        }

        if (keywords.isEmpty()) {
            System.err.println("ERROR: No Keyword found for: " + status.toString());
            return;
        }

        // insert into database, once per matched keyword
        for (String keyword : keywords) {
            databaseService.insertTweet(status.getText(), status.getCreatedAt(), keyword);
        }
    }

    // Appends the expanded URL of every non-null entity; a null array is ignored.
    private void appendExpandedUrls(StringBuilder target, URLEntity[] urls) {
        if (urls == null) {
            return;
        }
        for (URLEntity url : urls) {
            if (url != null) {
                appendFragment(target, url.getExpandedURL());
            }
        }
    }

    // Appends one non-null fragment followed by a line break, so that a keyword can never
    // match across the boundary between two unrelated fragments (e.g. "...ip" + "hone...").
    private void appendFragment(StringBuilder target, String fragment) {
        if (fragment != null) {
            target.append(fragment).append('\n');
        }
    }
    
    /**
     * Returns the tracked keywords whose pattern is found in the given text.
     *
     * @param tweet the text to search; may be {@code null}
     * @return a new list with every matching keyword; empty when nothing matches or when
     *         {@code tweet} is {@code null}
     */
    private List<String> getKeywordsFromTweet(String tweet) {
        List<String> result = new ArrayList<>();

        if (tweet == null) {
            // Pattern.matcher(null) would throw; a missing text simply matches nothing
            return result;
        }

        // entrySet() yields keyword and pattern together, avoiding a lookup per keyword
        for (Map.Entry<String, Pattern> entry : keywordsMap.entrySet()) {
            if (entry.getValue().matcher(tweet).find()) {
                result.add(entry.getKey());
            }
        }

        return result;
    }