Search code examples

How to autocorrect misplaced hyphen in a word?

Situation & Problem

1 .


Say, you have a paragraph.

The word sentence is broken down to sente-nce with a hyphen.

Imagine you have this sample sentence, which is a very long sente-
nce that has a word being broken down with a hyphen. 

2 .

How can I detect that word sente-nce is broken down with a hyphen, and correct it into sentence?


  • Is there any library I can use to do that (prefer Java / Python / any software)?

  • Using a simple regex to match all (\w)-(\w) & replace with $1$2 won't work in all cases.

    eg: Imagine you have a word event-driven, it will become eventdriven, which is undesired.


  • Solution (may not be the best)

    logic & usage



    1. regex match all words with a hyphen -

    2. loop check if those words are correct by using a dictionary

      _ & fix them if the hyphen is misplaced


    1. put your dictionary in Path path = Paths.get("words_alpha.txt"); <=

    2. put the text you want to auto-correct in content_TESTING

    3. execute & get output


    depending on the quality of the dictionary, the results may not be good.


    if your words contain "space or newline \n" -> modify the regex in String str_RegexPattern = "([a-zA-Z]+)-([a-zA-Z]+)";


    this is not fully tested yet



    package com.ex.main.autoCorrectHypen;
    import java.io.IOException;
    import java.nio.charset.StandardCharsets;
    import java.nio.file.Files;
    import java.nio.file.Path;
    import java.nio.file.Paths;
    import java.util.Collections;
    import java.util.HashSet;
    import java.util.Set;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;
    /*
     * 1. regex match all words with a hyphen -
     * 2. loop check if those words are correct by using a dictionary
     * _ & fix them if the hyphen is misplaced
     * 1. put your dictionary in `Path path = Paths.get("words_alpha.txt");` <=
     * 2. put the text you want to auto-correct in `content_TESTING`
     * 3. execute & get output
     * depending on the quality of the dictionary, the results may not be good.
     * if your words contain "space or newline \n" -> modify the regex in `String str_RegexPattern = "([a-zA-Z]+)-([a-zA-Z]+)";`
     * this is not fully tested yet
     */
    // ~//
    /**
     * In-memory word list used to decide whether a string is a known word.
     *
     * <p>When the class is first used it tries to load {@code words_alpha.txt} (one word per
     * line) from the working directory. Further word lists can be merged in with
     * {@link #initDictionary(Path)}.
     */
    class Dictionary {
      private static final Set<String> wordsSet = new HashSet<>();

      static {
        try {
          initDictionary();
        } catch (IOException e) {
          // Best-effort: a missing or unreadable word list leaves the dictionary empty, so every
          // lookup answers false, instead of making the whole class unusable.
          System.err.println("Could not load the default dictionary: " + e);
        }
      }

      /**
       * Loads the default word list, {@code words_alpha.txt}, into the dictionary.
       *
       * @throws IOException if the file cannot be read
       */
      public static void initDictionary() throws IOException {
        Path path = Paths.get("words_alpha.txt");
        initDictionary(path);
      }

      /**
       * Adds every word of the given UTF-8 word list (one word per line) to the dictionary.
       * Words that are already present are kept; nothing is removed.
       *
       * @param path the word-list file to read
       * @throws IOException if the file cannot be read
       */
      public static void initDictionary(Path path) throws IOException {
        String wordListContents = new String(Files.readAllBytes(path), StandardCharsets.UTF_8);
        // Accept \r\n, \n and \r so the same list works whatever platform it was saved on.
        for (String line : wordListContents.split("\\r?\\n|\\r")) {
          String word = line.trim();
          if (!word.isEmpty()) {
            wordsSet.add(word);
          }
        }
      }

      /**
       * Tells whether {@code word} is in the dictionary. The comparison is exact (case-sensitive).
       *
       * @param word the candidate word
       * @return {@code true} if the word was loaded from a word list
       */
      public static boolean contains(String word) {
        return wordsSet.contains(word);
      }
    }
    public class AutoCorrectHypen {
      public static String autoCorrectHypen(String content_ValidateOn) {
        String content_SearchOn = content_ValidateOn;
        String str_RegexPattern = "([a-zA-Z]+)-([a-zA-Z]+)";
        Pattern pattern = Pattern.compile(str_RegexPattern);
        Matcher matcher = pattern.matcher(content_SearchOn);
        StringBuilder sb_ContentSearchOn = new StringBuilder(content_SearchOn);
        StringBuilder content_Replaced = new StringBuilder();
        int ind_MatchGroupEnd_prev = 0;
        int ind_MatchGroupEnd_curr;
        int ind_MatchGroupStart_curr;
        while (matcher.find()) {
          ind_MatchGroupStart_curr = matcher.start(0);
          ind_MatchGroupEnd_curr = matcher.end(0);
          String content_BeforeMatchGroup = sb_ContentSearchOn.substring(ind_MatchGroupEnd_prev, ind_MatchGroupStart_curr); // prev end to curr start, not start to end
          String content_SearchOn_innerMatch_G0 =;
          String content_SearchOn_innerMatch_G1 =;
          String content_SearchOn_innerMatch_G2 =;
          String content_Replaced_innerMatch = autoCorrectHypen_innerMatch(content_SearchOn_innerMatch_G0, content_SearchOn_innerMatch_G1, content_SearchOn_innerMatch_G2);
          ind_MatchGroupEnd_prev = ind_MatchGroupEnd_curr;
        // append the content after the last match group
        String content_AfterLastMatchGroup = sb_ContentSearchOn.substring(ind_MatchGroupEnd_prev, sb_ContentSearchOn.length());
        return content_Replaced.toString();
      protected static String autoCorrectHypen_innerMatch(String content_SearchOn_innerMatch_G0, String content_SearchOn_innerMatch_G1, String content_SearchOn_innerMatch_G2) {
        System.out.printf("> %s; %s; %s; %n", content_SearchOn_innerMatch_G0, content_SearchOn_innerMatch_G1, content_SearchOn_innerMatch_G2);
        String content_Replaced_innerMatch = null;
        // @atten: order of the if stmt matters
        if (Dictionary.contains(content_SearchOn_innerMatch_G0)) {
          content_Replaced_innerMatch = content_SearchOn_innerMatch_G0;
          System.out.printf(">> %s: %n%s %n", "whole word - with hypen, G0", content_Replaced_innerMatch);
        } else if (Dictionary.contains(content_SearchOn_innerMatch_G1 + content_SearchOn_innerMatch_G2)) {
          content_Replaced_innerMatch = content_SearchOn_innerMatch_G1 + content_SearchOn_innerMatch_G2;
          System.out.printf(">> %s: %n%s %n", "whole word - remove hypen, G1 + G2", content_Replaced_innerMatch);
        } else if (Dictionary.contains(content_SearchOn_innerMatch_G1) && Dictionary.contains(content_SearchOn_innerMatch_G2)) {
          content_Replaced_innerMatch = content_SearchOn_innerMatch_G0;
          System.out.printf(">> %s: %n%s %n", "whole word - with hypen, G1 && G2", content_Replaced_innerMatch);
        } else {
          content_Replaced_innerMatch = content_SearchOn_innerMatch_G0;
          System.err.println(">> No such word");
        return content_Replaced_innerMatch;
      static final String content_TESTING_Simple = ""
                                                   + "Check the word sente-nce, event-driven, family-owned, chocolate-covered, anti-clockwise.\n"
                                                   + "samp-le, diff-erence, what-do-you-mean, how-ever, be-cause, other-wise, pill-ow";
      static final String content_TESTING = ""
                                            + "Imagine you have this sample sentence, which is a very long sente-\n"
                                            + "nce that has a word being broken down with a hyphen. \n"
                                            + "\n"
                                            + "Check the word sente-nce, event-driven, family-owned, chocolate-covered, anti-clockwise.\n"
                                            + "";
      public static void main(String[] args) throws Exception {
        System.out.println(autoCorrectHypen(content_TESTING_Simple)); // 


    Check the word sente-nce, event-driven, family-owned, chocolate-covered, anti-clockwise.
    samp-le, diff-erence, what-do-you-mean, how-ever, be-cause, other-wise, pill-ow


    Check the word sentence, event-driven, family-owned, chocolate-covered, anticlockwise.
    sample, difference, what-do-you-mean, however, because, otherwise, pillow