1 .
eg:
Say, you have a paragraph.
The word sentence
is broken down to sente-nce
with a hyphen.
Imagine you have this sample sentence, which is a very long sente-
nce that has a word being broken down with a hyphen.
2 .
How can I detect that word sente-nce
is broken down with a hyphen, and correct it into sentence
?
note:
Is there any library I can use to do that (preferably Java or Python, but any software is fine)?
Using a simple regex to match all (\w)-(\w)
& replace with $1$2
, won't work in all cases.
eg: Imagine you have a word event-driven
, it will become eventdriven
, which is undesired.
/*
@logic::
regex match all words containing a hyphen -
loop over the matches & check whether each word is correct by using a dictionary
_ & fix the ones that have a misplaced hyphen
@to_use::
put your dictionary in Path path = Paths.get("words_alpha.txt");
<= https://github.com/dwyl/english-words
put your sentence to autoCorrect on in content_TESTING
execute & get output
@note::
depending on the quality of the dictionary, the results may not be good.
@note::
if your words contains "space or newline \n" -> modify the regex in String str_RegexPattern = "([a-zA-Z]+)-([a-zA-Z]+)";
@note::
this is not fully tested yet
*/
package com.ex.main.autoCorrectHypen;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/*
@logic::
1. regex match all words containing a hyphen -
2. loop over the matches & check whether each word is correct by using a dictionary
_ & fix the ones that have a misplaced hyphen
@to_use::
1. put your dictionary in `Path path = Paths.get("words_alpha.txt");` <= https://github.com/dwyl/english-words
2. put your sentence to autoCorrect on in `content_TESTING`
3. execute & get output
@note::
depending on the quality of the dictionary, the results may not be good.
@note::
if your words contains "space or newline \n" -> modify the regex in `String str_RegexPattern = "([a-zA-Z]+)-([a-zA-Z]+)";`
@note::
this is not fully tested yet
*/
// https://stackoverflow.com/questions/11607270/how-to-check-whether-given-string-is-a-word
// https://github.com/dwyl/english-words
// ~// https://github.com/first20hours/google-10000-english
/**
 * In-memory word list used to decide whether a string is a known word.
 *
 * <p>The list is read once, when the class is initialized, from the file
 * {@code words_alpha.txt} resolved against the current working directory.
 * The file is expected to hold one word per line. If the file cannot be read,
 * the failure is reported on {@code System.err} and the dictionary stays
 * empty, so every {@link #contains(String)} call returns {@code false}.
 */
class Dictionary {
    private static final Set<String> wordsSet = new HashSet<>();

    /**
     * Reads {@code words_alpha.txt} and adds every non-blank line to the word set.
     *
     * <p>Lines may be terminated by {@code \r\n}, {@code \n} or {@code \r}; surrounding
     * whitespace is stripped from each entry. Calling this method again only re-adds
     * the same words.
     *
     * @throws IOException if the word list cannot be read
     */
    public static void initDictionary() throws IOException {
        Path path = Paths.get("words_alpha.txt");
        byte[] readBytes = Files.readAllBytes(path);
        String wordListContents = new String(readBytes, StandardCharsets.UTF_8);
        // Accept any line terminator: splitting on "\r\n" alone turns an LF-only
        // file into one giant "word" and makes every lookup fail.
        for (String line : wordListContents.split("\\r\\n|\\r|\\n")) {
            String word = line.trim();
            if (!word.isEmpty()) {
                wordsSet.add(word);
            }
        }
    }

    static {
        try {
            initDictionary();
        } catch (IOException e) {
            // Best effort: keep the class usable (with an empty dictionary) but say why.
            System.err.println("Dictionary: could not load word list 'words_alpha.txt': " + e);
        }
    }

    /**
     * Tells whether {@code word} is present in the loaded word list.
     *
     * @param word the exact (case-sensitive) string to look up
     * @return {@code true} if the word list contains {@code word}
     */
    public static boolean contains(String word) {
        return wordsSet.contains(word);
    }
}
/**
 * Repairs words that were split by a misplaced hyphen (for example
 * {@code sente-nce} or a word broken across a line as {@code sente-} /
 * {@code nce}) while leaving genuine compounds such as {@code event-driven}
 * untouched. Whether a candidate is a real word is decided by
 * {@code Dictionary.contains}.
 */
public class AutoCorrectHypen {

    /**
     * Two runs of ASCII letters joined by a hyphen. The hyphen may be followed by a
     * line break (with optional spaces/tabs around it) so that words hyphenated at the
     * end of a line are matched as well. Group 1 is the part before the hyphen, group 2
     * the part after it.
     */
    private static final Pattern HYPHENATED_WORD =
            Pattern.compile("([a-zA-Z]+)-(?:[ \\t]*\\r?\\n[ \\t]*)?([a-zA-Z]+)");

    /**
     * Returns a copy of the text in which every hyphenated letter sequence has been
     * passed through {@link #autoCorrectHypen_innerMatch(String, String, String)};
     * everything between the matches is copied unchanged.
     *
     * <p>Side effects: prints a {@code -------} separator to {@code System.out} after
     * the matches have been processed, in addition to the trace output of the inner
     * method.
     *
     * @param content_ValidateOn the text to correct; may be {@code null}
     * @return the corrected text, or {@code null} if the input was {@code null}
     */
    public static String autoCorrectHypen(String content_ValidateOn) {
        if (content_ValidateOn == null) {
            return null;
        }
        Matcher matcher = HYPHENATED_WORD.matcher(content_ValidateOn);
        StringBuilder content_Replaced = new StringBuilder(content_ValidateOn.length());
        int ind_MatchGroupEnd_prev = 0;
        while (matcher.find()) {
            // Copy the untouched text between the previous match and this one.
            content_Replaced.append(content_ValidateOn, ind_MatchGroupEnd_prev, matcher.start());
            content_Replaced.append(
                    autoCorrectHypen_innerMatch(matcher.group(0), matcher.group(1), matcher.group(2)));
            ind_MatchGroupEnd_prev = matcher.end();
        }
        System.out.println("-------");
        // Append the content after the last match (the whole text if nothing matched).
        content_Replaced.append(content_ValidateOn, ind_MatchGroupEnd_prev, content_ValidateOn.length());
        return content_Replaced.toString();
    }

    /**
     * Decides how a single hyphenated match should be written. The checks run in this
     * order, and the order matters:
     * <ol>
     *   <li>the whole match is a dictionary word: keep it as is;</li>
     *   <li>the two parts joined without the hyphen form a dictionary word: return the
     *       joined word;</li>
     *   <li>both parts are dictionary words on their own: keep the match as is;</li>
     *   <li>otherwise: keep the match as is and report {@code >> No such word} on
     *       {@code System.err}.</li>
     * </ol>
     *
     * <p>Side effects: prints one trace line with the three arguments, and one line
     * naming the branch taken, to {@code System.out} (or {@code System.err} for the
     * last case).
     *
     * @param content_SearchOn_innerMatch_G0 the whole matched text, hyphen included
     * @param content_SearchOn_innerMatch_G1 the letters before the hyphen
     * @param content_SearchOn_innerMatch_G2 the letters after the hyphen
     * @return the text that should replace the match
     */
    protected static String autoCorrectHypen_innerMatch(String content_SearchOn_innerMatch_G0, String content_SearchOn_innerMatch_G1, String content_SearchOn_innerMatch_G2) {
        System.out.printf("> %s; %s; %s; %n", content_SearchOn_innerMatch_G0, content_SearchOn_innerMatch_G1, content_SearchOn_innerMatch_G2);
        // @atten: the order of the checks matters
        if (Dictionary.contains(content_SearchOn_innerMatch_G0)) {
            System.out.printf(">> %s: %n%s %n", "whole word - with hypen, G0", content_SearchOn_innerMatch_G0);
            return content_SearchOn_innerMatch_G0;
        }
        String joined = content_SearchOn_innerMatch_G1 + content_SearchOn_innerMatch_G2;
        if (Dictionary.contains(joined)) {
            System.out.printf(">> %s: %n%s %n", "whole word - remove hypen, G1 + G2", joined);
            return joined;
        }
        if (Dictionary.contains(content_SearchOn_innerMatch_G1) && Dictionary.contains(content_SearchOn_innerMatch_G2)) {
            System.out.printf(">> %s: %n%s %n", "whole word - with hypen, G1 && G2", content_SearchOn_innerMatch_G0);
            return content_SearchOn_innerMatch_G0;
        }
        System.err.println(">> No such word");
        return content_SearchOn_innerMatch_G0;
    }

    //################################################################################################
    /** Sample input: hyphenated words on a single line, both broken words and real compounds. */
    static final String content_TESTING_Simple = ""
            + "Check the word sente-nce, event-driven, family-owned, chocolate-covered, anti-clockwise.\n"
            + "samp-le, diff-erence, what-do-you-mean, how-ever, be-cause, other-wise, pill-ow";

    /** Sample input that also contains a word hyphenated across a line break. */
    static final String content_TESTING = ""
            + "Imagine you have this sample sentence, which is a very long sente-\n"
            + "nce that has a word being broken down with a hyphen. \n"
            + "\n"
            + "Check the word sente-nce, event-driven, family-owned, chocolate-covered, anti-clockwise.\n"
            + "";

    /** Runs the correction on {@link #content_TESTING_Simple} and prints the result. */
    public static void main(String[] args) throws Exception {
        System.out.println(autoCorrectHypen(content_TESTING_Simple)); //
    }
}
input
Check the word sente-nce, event-driven, family-owned, chocolate-covered, anti-clockwise.
samp-le, diff-erence, what-do-you-mean, how-ever, be-cause, other-wise, pill-ow
output
Check the word sentence, event-driven, family-owned, chocolate-covered, anticlockwise.
sample, difference, what-do-you-mean, however, because, otherwise, pillow