Search code examples
mysqlduplicatesinner-join

Avoiding redundant rows in result of mysql inner join


I have 2 tables

CREATE TABLE `CORPUS_FILE_SENTENCE` (
  `corpus_file_sentence_id` int(11) NOT NULL AUTO_INCREMENT,
  `corpus_file_sentence_text` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL,
  `date_sentence_saved` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
  `sentence_number` int(11) NOT NULL,
  `corpus_file_id` int(11) NOT NULL,
  `comment` varchar(200) DEFAULT NULL,
  PRIMARY KEY (`corpus_file_sentence_id`),
  KEY `fk_corpus_file_sentence-corpus_file_id` (`corpus_file_id`),
  CONSTRAINT `fk_corpus_file_sentence-corpus_file_id` FOREIGN KEY (`corpus_file_id`) REFERENCES `CORPUS_FILE` (`corpus_file_id`)
) ENGINE=InnoDB AUTO_INCREMENT=3 DEFAULT CHARSET=latin1;

and

CREATE TABLE `CORPUS_FILE_PREPROCESSED_SENTENCE` (
  `corpus_file_preprocessed_sentence_id` int(11) NOT NULL AUTO_INCREMENT,
  `coreference_resolved_sentence_text` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `stopword_removed_sentence_text` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `lemmatized_sentence_text` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `lowercased_sentence_text` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `preprocessed_sentence_text` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `date_preprocessed_sentence_saved` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
  `corpus_file_sentence_id` int(11) NOT NULL,
  `comment` varchar(200) DEFAULT NULL,
  PRIMARY KEY (`corpus_file_preprocessed_sentence_id`),
  KEY `fk_corpus_file_preprocessed_sentence-corpus_file_sentence_id` (`corpus_file_sentence_id`),
  CONSTRAINT `fk_corpus_file_preprocessed_sentence-corpus_file_sentence_id` FOREIGN KEY (`corpus_file_sentence_id`) REFERENCES `CORPUS_FILE_SENTENCE` (`corpus_file_sentence_id`)
) ENGINE=InnoDB AUTO_INCREMENT=3 DEFAULT CHARSET=latin1;

I need to show the result of Coreference Resolution along with the original sentence and I was trying

select 
  CFS.corpus_file_sentence_id,
  CFS.sentence_number,
  CFS.corpus_file_sentence_text,
  CFPS.coreference_resolved_sentence_text
from CORPUS_FILE_SENTENCE CFS
inner join CORPUS_FILE_PREPROCESSED_SENTENCE CFPS
  on  CFS.corpus_file_sentence_id = CFPS.corpus_file_sentence_id
  and CFS.corpus_file_id = 3;

This is resulting in duplicate rows of the form

enter image description here

How do I remove the redundant rows?


Solution

  • It is really strange that you get these results, but for these results you can remove the rows with nulls in the column coreference_resolved_sentence_text with group by and aggregation:

    select 
      CFS.corpus_file_sentence_id, CFS.sentence_number, 
      CFS.corpus_file_sentence_text, max(CFPS.coreference_resolved_sentence_text) 
    from CORPUS_FILE_SENTENCE CFS inner join CORPUS_FILE_PREPROCESSED_SENTENCE CFPS 
    on CFS.corpus_file_sentence_id = CFPS.corpus_file_sentence_id and CFS.corpus_file_id = 3
    group by 
      CFS.corpus_file_sentence_id, 
      CFS.sentence_number, 
      CFS.corpus_file_sentence_text