In addition to finding whole words, my algorithm should take into account the fact that my text is ISO-Latin encoded (French words), and thus contains some accented characters.
Take a look at this code snippet; I hope you enjoy it.
import java.text.Normalizer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import junit.framework.TestCase;
public class TestRegex extends TestCase{
public void testRegex(){
String INPUT = "l'postéEç! toto tata a problème à probleme";
List<String> listCnil = Arrays.asList(new String[]{"l'postéeç!", "a problème"});
String unAccentInput = unAccent(INPUT);
Set<String> inputWords = new HashSet<String>();
for (String word : listCnil) {
word = unAccent(word);
Pattern p = Pattern.compile("\\b\\Q" + word + "\\E\\b", Pattern.CASE_INSENSITIVE);
Matcher m = p.matcher(unAccentInput);
while (m.find()) {
String tt = INPUT.substring(m.start(), m.end());
inputWords.add(tt);
}
}
for (String str : inputWords) {
Pattern p = Pattern.compile(\\b\\Q"+str+"\\E\\b", Pattern.CASE_INSENSITIVE);
Matcher m = p.matcher(INPUT);
StringBuffer sb = new StringBuffer();
while (m.find()) {
String REMP = "<p>"+m.group()+"</p>";
m.appendReplacement(sb, REMP);
}
m.appendTail(sb);
INPUT=sb.toString();
}
System.out.println(INPUT);
}
public static String unAccent(String s) {
String temp = Normalizer.normalize(s, Normalizer.Form.NFD);
Pattern pattern = Pattern.compile("\\p{InCombiningDiacriticalMarks}+");
return pattern.matcher(temp).replaceAll("");
}
}